1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 /* SPDX-License-Identifier: GPL-2.0 */ /* * Events for filesystem locks * * Copyright 2013 Jeff Layton <jlayton@poochiereds.net> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM filelock #if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FILELOCK_H #include <linux/tracepoint.h> #include <linux/fs.h> #include <linux/device.h> #include <linux/kdev_t.h> #define show_fl_flags(val) \ __print_flags(val, "|", \ { FL_POSIX, "FL_POSIX" }, \ { FL_FLOCK, "FL_FLOCK" }, \ { FL_DELEG, "FL_DELEG" }, \ { FL_ACCESS, "FL_ACCESS" }, \ { FL_EXISTS, "FL_EXISTS" }, \ { FL_LEASE, "FL_LEASE" }, \ { FL_CLOSE, "FL_CLOSE" }, \ { FL_SLEEP, "FL_SLEEP" }, \ { FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \ { FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \ { FL_OFDLCK, "FL_OFDLCK" }) #define show_fl_type(val) \ __print_symbolic(val, \ { F_RDLCK, "F_RDLCK" }, \ { F_WRLCK, "F_WRLCK" }, \ { F_UNLCK, "F_UNLCK" }) TRACE_EVENT(locks_get_lock_context, TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx), TP_ARGS(inode, type, ctx), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(unsigned char, type) __field(struct file_lock_context *, ctx) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->type = type; __entry->ctx = ctx; ), TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, show_fl_type(__entry->type), __entry->ctx) ); DECLARE_EVENT_CLASS(filelock_lock, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret), TP_STRUCT__entry( __field(struct file_lock *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock *, fl_blocker) __field(fl_owner_t, fl_owner) __field(unsigned int, fl_pid) __field(unsigned int, fl_flags) __field(unsigned char, fl_type) __field(loff_t, fl_start) __field(loff_t, fl_end) __field(int, ret) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->fl_blocker = fl ? fl->fl_blocker : NULL; __entry->fl_owner = fl ? fl->fl_owner : NULL; __entry->fl_pid = fl ? fl->fl_pid : 0; __entry->fl_flags = fl ? fl->fl_flags : 0; __entry->fl_type = fl ? fl->fl_type : 0; __entry->fl_start = fl ? fl->fl_start : 0; __entry->fl_end = fl ? fl->fl_end : 0; __entry->ret = ret; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->fl_blocker, __entry->fl_owner, __entry->fl_pid, show_fl_flags(__entry->fl_flags), show_fl_type(__entry->fl_type), __entry->fl_start, __entry->fl_end, __entry->ret) ); DEFINE_EVENT(filelock_lock, posix_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, fcntl_setlk, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, locks_remove_posix, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, flock_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DECLARE_EVENT_CLASS(filelock_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(struct file_lock *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock *, fl_blocker) __field(fl_owner_t, fl_owner) __field(unsigned int, fl_flags) __field(unsigned char, fl_type) __field(unsigned long, fl_break_time) __field(unsigned long, fl_downgrade_time) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->fl_blocker = fl ? fl->fl_blocker : NULL; __entry->fl_owner = fl ? fl->fl_owner : NULL; __entry->fl_flags = fl ? fl->fl_flags : 0; __entry->fl_type = fl ? fl->fl_type : 0; __entry->fl_break_time = fl ? fl->fl_break_time : 0; __entry->fl_downgrade_time = fl ? fl->fl_downgrade_time : 0; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->fl_blocker, __entry->fl_owner, show_fl_flags(__entry->fl_flags), show_fl_type(__entry->fl_type), __entry->fl_break_time, __entry->fl_downgrade_time) ); DEFINE_EVENT(filelock_lease, break_lease_noblock, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_block, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_unblock, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, generic_delete_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, time_out_leases, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); TRACE_EVENT(generic_add_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(int, wcount) __field(int, rcount) __field(int, icount) __field(dev_t, s_dev) __field(fl_owner_t, fl_owner) __field(unsigned int, fl_flags) __field(unsigned char, fl_type) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); __entry->icount = atomic_read(&inode->i_count); __entry->fl_owner = fl->fl_owner; __entry->fl_flags = fl->fl_flags; __entry->fl_type = fl->fl_type; ), TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->wcount, __entry->rcount, __entry->icount, __entry->fl_owner, show_fl_flags(__entry->fl_flags), show_fl_type(__entry->fl_type)) ); TRACE_EVENT(leases_conflict, TP_PROTO(bool conflict, struct file_lock *lease, struct file_lock *breaker), TP_ARGS(conflict, lease, breaker), TP_STRUCT__entry( __field(void *, lease) __field(void *, breaker) __field(unsigned int, l_fl_flags) __field(unsigned int, b_fl_flags) __field(unsigned char, l_fl_type) __field(unsigned char, b_fl_type) __field(bool, conflict) ), TP_fast_assign( __entry->lease = lease; __entry->l_fl_flags = lease->fl_flags; __entry->l_fl_type = lease->fl_type; __entry->breaker = breaker; __entry->b_fl_flags = breaker->fl_flags; __entry->b_fl_type = breaker->fl_type; __entry->conflict = conflict; ), TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s", __entry->conflict, __entry->lease, show_fl_flags(__entry->l_fl_flags), show_fl_type(__entry->l_fl_type), __entry->breaker, show_fl_flags(__entry->b_fl_flags), show_fl_type(__entry->b_fl_type)) ); #endif /* _TRACE_FILELOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1401 1254 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_INTERNAL_H #define _LINUX_HIGHMEM_INTERNAL_H /* * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. */ #ifdef CONFIG_KMAP_LOCAL void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); void *__kmap_local_page_prot(struct page *page, pgprot_t prot); void kunmap_local_indexed(const void *vaddr); void kmap_local_fork(struct task_struct *tsk); void __kmap_local_sched_out(void); void __kmap_local_sched_in(void); static inline void kmap_assert_nomap(void) { DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx); } #else static inline void kmap_local_fork(struct task_struct *tsk) { } static inline void kmap_assert_nomap(void) { } #endif #ifdef CONFIG_HIGHMEM #include <asm/highmem.h> #ifndef ARCH_HAS_KMAP_FLUSH_TLB static inline void kmap_flush_tlb(unsigned long addr) { } #endif #ifndef kmap_prot #define kmap_prot PAGE_KERNEL #endif void *kmap_high(struct page *page); void kunmap_high(struct page *page); void __kmap_flush_unused(void); struct page *__kmap_to_page(void *addr); static inline void *kmap(struct page *page) { void *addr; might_sleep(); if (!PageHighMem(page)) addr = page_address(page); else addr = kmap_high(page); kmap_flush_tlb((unsigned long)addr); return addr; } static inline void kunmap(struct page *page) { might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); } static inline struct page *kmap_to_page(void *addr) { return __kmap_to_page(addr); } static inline void kmap_flush_unused(void) { __kmap_flush_unused(); } static inline void *kmap_local_page(struct page *page) { return __kmap_local_page_prot(page, kmap_prot); } static inline void *kmap_local_folio(struct folio *folio, size_t offset) { struct page *page = folio_page(folio, offset / PAGE_SIZE); return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE; } static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return __kmap_local_page_prot(page, prot); } static inline void *kmap_local_pfn(unsigned long pfn) { return __kmap_local_pfn_prot(pfn, kmap_prot); } static inline void __kunmap_local(const void *vaddr) { kunmap_local_indexed(vaddr); } static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return __kmap_local_page_prot(page, prot); } static inline void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); } static inline void *kmap_atomic_pfn(unsigned long pfn) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return __kmap_local_pfn_prot(pfn, kmap_prot); } static inline void __kunmap_atomic(const void *addr) { kunmap_local_indexed(addr); pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } unsigned int __nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; static inline unsigned int nr_free_highpages(void) { return __nr_free_highpages(); } static inline unsigned long totalhigh_pages(void) { return (unsigned long)atomic_long_read(&_totalhigh_pages); } static inline void totalhigh_pages_add(long count) { atomic_long_add(count, &_totalhigh_pages); } static inline bool is_kmap_addr(const void *x) { unsigned long addr = (unsigned long)x; return (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) || (addr >= __fix_to_virt(FIX_KMAP_END) && addr < __fix_to_virt(FIX_KMAP_BEGIN)); } #else /* CONFIG_HIGHMEM */ static inline struct page *kmap_to_page(void *addr) { return virt_to_page(addr); } static inline void *kmap(struct page *page) { might_sleep(); return page_address(page); } static inline void kunmap_high(struct page *page) { } static inline void kmap_flush_unused(void) { } static inline void kunmap(struct page *page) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(page_address(page)); #endif } static inline void *kmap_local_page(struct page *page) { return page_address(page); } static inline void *kmap_local_folio(struct folio *folio, size_t offset) { return page_address(&folio->page) + offset; } static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return kmap_local_page(page); } static inline void *kmap_local_pfn(unsigned long pfn) { return kmap_local_page(pfn_to_page(pfn)); } static inline void __kunmap_local(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif } static inline void *kmap_atomic(struct page *page) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return page_address(page); } static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { return kmap_atomic(page); } static inline void *kmap_atomic_pfn(unsigned long pfn) { return kmap_atomic(pfn_to_page(pfn)); } static inline void __kunmap_atomic(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } static inline unsigned int nr_free_highpages(void) { return 0; } static inline unsigned long totalhigh_pages(void) { return 0UL; } static inline bool is_kmap_addr(const void *x) { return false; } #endif /* CONFIG_HIGHMEM */ /** * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - deprecated! * @__addr: Virtual address to be unmapped * * Unmaps an address previously mapped by kmap_atomic() and re-enables * pagefaults. Depending on PREEMP_RT configuration, re-enables also * migration and preemption. Users should not count on these side effects. * * Mappings should be unmapped in the reverse order that they were mapped. * See kmap_local_page() for details on nesting. * * @__addr can be any address within the mapped page, so there is no need * to subtract any offset that has been added. In contrast to kunmap(), * this function takes the address returned from kmap_atomic(), not the * page passed to it. The compiler will warn you if you pass the page. */ #define kunmap_atomic(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ __kunmap_atomic(__addr); \ } while (0) /** * kunmap_local - Unmap a page mapped via kmap_local_page(). * @__addr: An address within the page mapped * * @__addr can be any address within the mapped page. Commonly it is the * address return from kmap_local_page(), but it can also include offsets. * * Unmapping should be done in the reverse order of the mapping. See * kmap_local_page() for details. */ #define kunmap_local(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ __kunmap_local(__addr); \ } while (0) #endif
17714 1916 17706 17712 12714 12714 14226 14226 452 14179 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 // SPDX-License-Identifier: GPL-2.0 #include <linux/memblock.h> #include <linux/mmdebug.h> #include <linux/export.h> #include <linux/mm.h> #include <asm/page.h> #include <linux/vmalloc.h> #include "physaddr.h" #ifdef CONFIG_X86_64 #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { unsigned long y = x - __START_KERNEL_map; /* use the carry flag to determine if x was < __START_KERNEL_map */ if (unlikely(x > y)) { x = y + phys_base; VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); } else { x = y + (__START_KERNEL_map - PAGE_OFFSET); /* carry flag will be set if starting x was >= PAGE_OFFSET */ VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x)); } return x; } EXPORT_SYMBOL(__phys_addr); unsigned long __phys_addr_symbol(unsigned long x) { unsigned long y = x - __START_KERNEL_map; /* only check upper bounds since lower bounds will trigger carry */ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); return y + phys_base; } EXPORT_SYMBOL(__phys_addr_symbol); #endif bool __virt_addr_valid(unsigned long x) { unsigned long y = x - __START_KERNEL_map; /* use the carry flag to determine if x was < __START_KERNEL_map */ if (unlikely(x > y)) { x = y + phys_base; if (y >= KERNEL_IMAGE_SIZE) return false; } else { x = y + (__START_KERNEL_map - PAGE_OFFSET); /* carry flag will be set if starting x was >= PAGE_OFFSET */ if ((x > y) || !phys_addr_valid(x)) return false; } return pfn_valid(x >> PAGE_SHIFT); } EXPORT_SYMBOL(__virt_addr_valid); #else #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { unsigned long phys_addr = x - PAGE_OFFSET; /* VMALLOC_* aren't constants */ VIRTUAL_BUG_ON(x < PAGE_OFFSET); VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); /* max_low_pfn is set early, but not _that_ early */ if (max_low_pfn) { VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn); BUG_ON(slow_virt_to_phys((void *)x) != phys_addr); } return phys_addr; } EXPORT_SYMBOL(__phys_addr); #endif bool __virt_addr_valid(unsigned long x) { if (x < PAGE_OFFSET) return false; if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) return false; if (x >= FIXADDR_START) return false; return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); } EXPORT_SYMBOL(__virt_addr_valid); #endif /* CONFIG_X86_64 */
95 90 115 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM page_pool #if !defined(_TRACE_PAGE_POOL_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGE_POOL_H #include <linux/types.h> #include <linux/tracepoint.h> #include <trace/events/mmflags.h> #include <net/page_pool/types.h> TRACE_EVENT(page_pool_release, TP_PROTO(const struct page_pool *pool, s32 inflight, u32 hold, u32 release), TP_ARGS(pool, inflight, hold, release), TP_STRUCT__entry( __field(const struct page_pool *, pool) __field(s32, inflight) __field(u32, hold) __field(u32, release) __field(u64, cnt) ), TP_fast_assign( __entry->pool = pool; __entry->inflight = inflight; __entry->hold = hold; __entry->release = release; __entry->cnt = pool->destroy_cnt; ), TP_printk("page_pool=%p inflight=%d hold=%u release=%u cnt=%llu", __entry->pool, __entry->inflight, __entry->hold, __entry->release, __entry->cnt) ); TRACE_EVENT(page_pool_state_release, TP_PROTO(const struct page_pool *pool, const struct page *page, u32 release), TP_ARGS(pool, page, release), TP_STRUCT__entry( __field(const struct page_pool *, pool) __field(const struct page *, page) __field(u32, release) __field(unsigned long, pfn) ), TP_fast_assign( __entry->pool = pool; __entry->page = page; __entry->release = release; __entry->pfn = page_to_pfn(page); ), TP_printk("page_pool=%p page=%p pfn=0x%lx release=%u", __entry->pool, __entry->page, __entry->pfn, __entry->release) ); TRACE_EVENT(page_pool_state_hold, TP_PROTO(const struct page_pool *pool, const struct page *page, u32 hold), TP_ARGS(pool, page, hold), TP_STRUCT__entry( __field(const struct page_pool *, pool) __field(const struct page *, page) __field(u32, hold) __field(unsigned long, pfn) ), TP_fast_assign( __entry->pool = pool; __entry->page = page; __entry->hold = hold; __entry->pfn = page_to_pfn(page); ), TP_printk("page_pool=%p page=%p pfn=0x%lx hold=%u", __entry->pool, __entry->page, __entry->pfn, __entry->hold) ); TRACE_EVENT(page_pool_update_nid, TP_PROTO(const struct page_pool *pool, int new_nid), TP_ARGS(pool, new_nid), TP_STRUCT__entry( __field(const struct page_pool *, pool) __field(int, pool_nid) __field(int, new_nid) ), TP_fast_assign( __entry->pool = pool; __entry->pool_nid = pool->p.nid; __entry->new_nid = new_nid; ), TP_printk("page_pool=%p pool_nid=%d new_nid=%d", __entry->pool, __entry->pool_nid, __entry->new_nid) ); #endif /* _TRACE_PAGE_POOL_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
4 4 2 17 3 1464 162 3 1465 105 6 1468 139 23 22 2 4 12 12 11 11 11 12 1 10 7 7 7 1468 12 11 3 8 3 3 2 11 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: * Fix for packet capture - Nick Eggleston <nick@dccinc.com>; * Add HW acceleration hooks - David S. Miller <davem@redhat.com>; * Correct all the locking - David S. Miller <davem@redhat.com>; * Use hash table for VLAN groups - David S. Miller <davem@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <net/p8022.h> #include <net/arp.h> #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/uaccess.h> #include <linux/if_vlan.h> #include "vlan.h" #include "vlanproc.h" #define DRV_VERSION "1.8" /* Global VLAN variables */ unsigned int vlan_net_id __read_mostly; const char vlan_fullname[] = "802.1Q VLAN Support"; const char vlan_version[] = DRV_VERSION; /* End of global variables definitions. */ static int vlan_group_prealloc_vid(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id) { struct net_device **array; unsigned int vidx; unsigned int size; int pidx; ASSERT_RTNL(); pidx = vlan_proto_idx(vlan_proto); if (pidx < 0) return -EINVAL; vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN; array = vg->vlan_devices_arrays[pidx][vidx]; if (array != NULL) return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; array = kzalloc(size, GFP_KERNEL_ACCOUNT); if (array == NULL) return -ENOBUFS; /* paired with smp_rmb() in __vlan_group_get_device() */ smp_wmb(); vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } static void vlan_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev, struct vlan_dev_priv *vlan) { if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_stacked_transfer_operstate(rootdev, dev); } void unregister_vlan_dev(struct net_device *dev, struct list_head *head) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct vlan_info *vlan_info; struct vlan_group *grp; u16 vlan_id = vlan->vlan_id; ASSERT_RTNL(); vlan_info = rtnl_dereference(real_dev->vlan_info); BUG_ON(!vlan_info); grp = &vlan_info->grp; grp->nr_vlan_devs--; if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_leave(dev); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_leave(dev); vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL); netdev_upper_dev_unlink(real_dev, dev); /* Because unregister_netdevice_queue() makes sure at least one rcu * grace period is respected before device freeing, * we dont need to call synchronize_net() here. */ unregister_netdevice_queue(dev, head); if (grp->nr_vlan_devs == 0) { vlan_mvrp_uninit_applicant(real_dev); vlan_gvrp_uninit_applicant(real_dev); } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); } int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack) { const char *name = real_dev->name; if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { pr_info("VLANs not supported on %s\n", name); NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; } if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) { NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists"); return -EEXIST; } return 0; } int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; u16 vlan_id = vlan->vlan_id; struct vlan_info *vlan_info; struct vlan_group *grp; int err; err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id); if (err) return err; vlan_info = rtnl_dereference(real_dev->vlan_info); /* vlan_info should be there now. vlan_vid_add took care of it */ BUG_ON(!vlan_info); grp = &vlan_info->grp; if (grp->nr_vlan_devs == 0) { err = vlan_gvrp_init_applicant(real_dev); if (err < 0) goto out_vid_del; err = vlan_mvrp_init_applicant(real_dev); if (err < 0) goto out_uninit_gvrp; } err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id); if (err < 0) goto out_uninit_mvrp; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; vlan_stacked_transfer_operstate(real_dev, dev, vlan); linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ /* So, got the sucker initialized, now lets place * it into our local structure. */ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; return 0; out_unregister_netdev: unregister_netdevice(dev); out_uninit_mvrp: if (grp->nr_vlan_devs == 0) vlan_mvrp_uninit_applicant(real_dev); out_uninit_gvrp: if (grp->nr_vlan_devs == 0) vlan_gvrp_uninit_applicant(real_dev); out_vid_del: vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); return err; } /* Attach a VLAN device to a mac address (ie Ethernet Card). * Returns 0 if the device was created or a negative error code otherwise. */ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) { struct net_device *new_dev; struct vlan_dev_priv *vlan; struct net *net = dev_net(real_dev); struct vlan_net *vn = net_generic(net, vlan_net_id); char name[IFNAMSIZ]; int err; if (vlan_id >= VLAN_VID_MASK) return -ERANGE; err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id, NULL); if (err < 0) return err; /* Gotta set up the fields for the device. */ switch (vn->name_type) { case VLAN_NAME_TYPE_RAW_PLUS_VID: /* name will look like: eth1.0005 */ snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: vlan5 */ snprintf(name, IFNAMSIZ, "vlan%i", vlan_id); break; case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: eth0.5 */ snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID: /* Put our vlan.VID in the name. * Name will look like: vlan0005 */ default: snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id); } new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, NET_NAME_UNKNOWN, vlan_setup); if (new_dev == NULL) return -ENOBUFS; dev_net_set(new_dev, net); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); vlan->vlan_id = vlan_id; vlan->real_dev = real_dev; vlan->dent = NULL; vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; return 0; out_free_newdev: free_netdev(new_dev); return err; } static void vlan_sync_address(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); /* May be called without an actual change */ if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; /* vlan continues to inherit address of lower device */ if (vlan_dev_inherit_address(vlandev, dev)) goto out; /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_del(dev, vlandev->dev_addr); /* vlan address was equal to the old address and is different from * the new address */ if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } static void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); netif_inherit_tso_max(vlandev, dev); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; else vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN; #if IS_ENABLED(CONFIG_FCOE) vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid; #endif vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE; vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev); netdev_update_features(vlandev); } static int __vlan_device_event(struct net_device *dev, unsigned long event) { int err = 0; switch (event) { case NETDEV_CHANGENAME: vlan_proc_rem_dev(dev); err = vlan_proc_add_dev(dev); break; case NETDEV_REGISTER: err = vlan_proc_add_dev(dev); break; case NETDEV_UNREGISTER: vlan_proc_rem_dev(dev); break; } return err; } static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; int i, flgs; struct net_device *vlandev; struct vlan_dev_priv *vlan; bool last = false; LIST_HEAD(list); int err; if (is_vlan_dev(dev)) { int err = __vlan_device_event(dev, event); if (err) return notifier_from_errno(err); } if ((event == NETDEV_UP) && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) { pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } if (event == NETDEV_DOWN && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) goto out; grp = &vlan_info->grp; /* It is OK that we do not hold the group lock right now, * as we run under the RTNL lock. */ switch (event) { case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); break; case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan_sync_address(dev, vlandev); } break; case NETDEV_CHANGEMTU: vlan_group_for_each_dev(grp, i, vlandev) { if (vlandev->mtu <= dev->mtu) continue; dev_set_mtu(vlandev, dev->mtu); } break; case NETDEV_FEAT_CHANGE: /* Propagate device features to underlying device */ vlan_group_for_each_dev(grp, i, vlandev) vlan_transfer_features(dev, vlandev); break; case NETDEV_DOWN: { struct net_device *tmp; LIST_HEAD(close_list); /* Put all VLANs for this dev in the down state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) list_add(&vlandev->close_list, &close_list); } dev_close_many(&close_list, false); list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); list_del_init(&vlandev->close_list); } list_del(&close_list); break; } case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = dev_get_flags(vlandev); if (flgs & IFF_UP) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) dev_change_flags(vlandev, flgs | IFF_UP, extack); vlan_stacked_transfer_operstate(dev, vlandev, vlan); } break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; vlan_group_for_each_dev(grp, i, vlandev) { /* removal of last vid destroys vlan_info, abort * afterwards */ if (vlan_info->nr_vids == 1) last = true; unregister_vlan_dev(vlandev, &list); if (last) break; } unregister_netdevice_many(&list); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlaying device to change its type. */ if (vlan_uses_dev(dev)) return NOTIFY_BAD; break; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); break; case NETDEV_CVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q)); if (err) return notifier_from_errno(err); break; case NETDEV_CVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q)); break; case NETDEV_SVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD)); if (err) return notifier_from_errno(err); break; case NETDEV_SVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD)); break; } out: return NOTIFY_DONE; } static struct notifier_block vlan_notifier_block __read_mostly = { .notifier_call = vlan_device_event, }; /* * VLAN IOCTL handler. * o execute requested action or pass command to the device driver * arg is really a struct vlan_ioctl_args __user *. */ static int vlan_ioctl_handler(struct net *net, void __user *arg) { int err; struct vlan_ioctl_args args; struct net_device *dev = NULL; if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args))) return -EFAULT; /* Null terminate this sucker, just in case. */ args.device1[sizeof(args.device1) - 1] = 0; args.u.device2[sizeof(args.u.device2) - 1] = 0; rtnl_lock(); switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: case SET_VLAN_EGRESS_PRIORITY_CMD: case SET_VLAN_FLAG_CMD: case ADD_VLAN_CMD: case DEL_VLAN_CMD: case GET_VLAN_REALDEV_NAME_CMD: case GET_VLAN_VID_CMD: err = -ENODEV; dev = __dev_get_by_name(net, args.device1); if (!dev) goto out; err = -EINVAL; if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev)) goto out; } switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; vlan_dev_set_ingress_priority(dev, args.u.skb_priority, args.vlan_qos); err = 0; break; case SET_VLAN_EGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_set_egress_priority(dev, args.u.skb_priority, args.vlan_qos); break; case SET_VLAN_FLAG_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_change_flags(dev, args.vlan_qos ? args.u.flag : 0, args.u.flag); break; case SET_VLAN_NAME_TYPE_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) { struct vlan_net *vn; vn = net_generic(net, vlan_net_id); vn->name_type = args.u.name_type; err = 0; } else { err = -EINVAL; } break; case ADD_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = register_vlan_device(dev, args.u.VID); break; case DEL_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; unregister_vlan_dev(dev, NULL); err = 0; break; case GET_VLAN_REALDEV_NAME_CMD: err = 0; vlan_dev_get_realdev_name(dev, args.u.device2, sizeof(args.u.device2)); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; case GET_VLAN_VID_CMD: err = 0; args.u.VID = vlan_dev_vlan_id(dev); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; default: err = -EOPNOTSUPP; break; } out: rtnl_unlock(); return err; } static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); int err; vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD; err = vlan_proc_init(net); return err; } static void __net_exit vlan_exit_net(struct net *net) { vlan_proc_cleanup(net); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, .exit = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct vlan_net), }; static int __init vlan_proto_init(void) { int err; pr_info("%s v%s\n", vlan_fullname, vlan_version); err = register_pernet_subsys(&vlan_net_ops); if (err < 0) goto err0; err = register_netdevice_notifier(&vlan_notifier_block); if (err < 0) goto err2; err = vlan_gvrp_init(); if (err < 0) goto err3; err = vlan_mvrp_init(); if (err < 0) goto err4; err = vlan_netlink_init(); if (err < 0) goto err5; vlan_ioctl_set(vlan_ioctl_handler); return 0; err5: vlan_mvrp_uninit(); err4: vlan_gvrp_uninit(); err3: unregister_netdevice_notifier(&vlan_notifier_block); err2: unregister_pernet_subsys(&vlan_net_ops); err0: return err; } static void __exit vlan_cleanup_module(void) { vlan_ioctl_set(NULL); vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); unregister_pernet_subsys(&vlan_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ vlan_mvrp_uninit(); vlan_gvrp_uninit(); } module_init(vlan_proto_init); module_exit(vlan_cleanup_module); MODULE_DESCRIPTION("802.1Q/802.1ad VLAN Protocol"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION);
193 193 193 193 88 44 44 44 193 193 193 193 193 193 193 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 // SPDX-License-Identifier: GPL-2.0-only /* * Pluggable TCP upper layer protocol support. * * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * */ #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> #include <linux/gfp.h> #include <net/tcp.h> static DEFINE_SPINLOCK(tcp_ulp_list_lock); static LIST_HEAD(tcp_ulp_list); /* Simple linear search, don't expect many entries! */ static struct tcp_ulp_ops *tcp_ulp_find(const char *name) { struct tcp_ulp_ops *e; list_for_each_entry_rcu(e, &tcp_ulp_list, list, lockdep_is_held(&tcp_ulp_list_lock)) { if (strcmp(e->name, name) == 0) return e; } return NULL; } static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) { const struct tcp_ulp_ops *ulp = NULL; rcu_read_lock(); ulp = tcp_ulp_find(name); #ifdef CONFIG_MODULES if (!ulp && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp-ulp-%s", name); rcu_read_lock(); ulp = tcp_ulp_find(name); } #endif if (!ulp || !try_module_get(ulp->owner)) ulp = NULL; rcu_read_unlock(); return ulp; } /* Attach new upper layer protocol to the list * of available protocols. */ int tcp_register_ulp(struct tcp_ulp_ops *ulp) { int ret = 0; spin_lock(&tcp_ulp_list_lock); if (tcp_ulp_find(ulp->name)) ret = -EEXIST; else list_add_tail_rcu(&ulp->list, &tcp_ulp_list); spin_unlock(&tcp_ulp_list_lock); return ret; } EXPORT_SYMBOL_GPL(tcp_register_ulp); void tcp_unregister_ulp(struct tcp_ulp_ops *ulp) { spin_lock(&tcp_ulp_list_lock); list_del_rcu(&ulp->list); spin_unlock(&tcp_ulp_list_lock); synchronize_rcu(); } EXPORT_SYMBOL_GPL(tcp_unregister_ulp); /* Build string with list of available upper layer protocl values */ void tcp_get_available_ulp(char *buf, size_t maxlen) { struct tcp_ulp_ops *ulp_ops; size_t offs = 0; *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ulp_ops, &tcp_ulp_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ulp_ops->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } void tcp_update_ulp(struct sock *sk, struct proto *proto, void (*write_space)(struct sock *sk)) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ulp_ops->update) icsk->icsk_ulp_ops->update(sk, proto, write_space); } void tcp_cleanup_ulp(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); /* No sock_owned_by_me() check here as at the time the * stack calls this function, the socket is dead and * about to be destroyed. */ if (!icsk->icsk_ulp_ops) return; if (icsk->icsk_ulp_ops->release) icsk->icsk_ulp_ops->release(sk); module_put(icsk->icsk_ulp_ops->owner); icsk->icsk_ulp_ops = NULL; } static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops) { struct inet_connection_sock *icsk = inet_csk(sk); int err; err = -EEXIST; if (icsk->icsk_ulp_ops) goto out_err; if (sk->sk_socket) clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); err = -ENOTCONN; if (!ulp_ops->clone && sk->sk_state == TCP_LISTEN) goto out_err; err = ulp_ops->init(sk); if (err) goto out_err; icsk->icsk_ulp_ops = ulp_ops; return 0; out_err: module_put(ulp_ops->owner); return err; } int tcp_set_ulp(struct sock *sk, const char *name) { const struct tcp_ulp_ops *ulp_ops; sock_owned_by_me(sk); ulp_ops = __tcp_ulp_find_autoload(name); if (!ulp_ops) return -ENOENT; return __tcp_set_ulp(sk, ulp_ops); }
79 77 51 72 5 25 5 67 70 70 66 66 70 70 1 69 66 53 53 14 52 53 51 39 4 39 39 12 41 12 12 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/pipe.c * * Copyright (C) 1991, 1992, 1999 Linus Torvalds */ #include <linux/mm.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/log2.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/magic.h> #include <linux/pipe_fs_i.h> #include <linux/uio.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> #include <linux/memcontrol.h> #include <linux/watch_queue.h> #include <linux/sysctl.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include "internal.h" /* * New pipe buffers will be restricted to this size while the user is exceeding * their pipe buffer quota. The general pipe use case needs at least two * buffers: one for data yet to be read, and one for new data. If this is less * than two, then a write to a non-empty pipe may block even if the pipe is not * full. This can occur with GNU make jobserver or similar uses of pipes as * semaphores: multiple processes may be waiting to write tokens back to the * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/. * * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their * own risk, namely: pipe writes to non-full pipes may block until the pipe is * emptied. */ #define PIPE_MIN_DEF_BUFFERS 2 /* * The max size that a non-root user is allowed to grow the pipe. Can * be set by root in /proc/sys/fs/pipe-max-size */ static unsigned int pipe_max_size = 1048576; /* Maximum allocatable pages per user. Hard limit is unset by default, soft * matches default values. */ static unsigned long pipe_user_pages_hard; static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; /* * We use head and tail indices that aren't masked off, except at the point of * dereference, but rather they're allowed to wrap naturally. This means there * isn't a dead spot in the buffer, but the ring has to be a power of two and * <= 2^31. * -- David Howells 2019-09-23. * * Reads with count = 0 should always return 0. * -- Julian Bradfield 1999-06-07. * * FIFOs and Pipes now generate SIGIO for both readers and writers. * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 * * pipe_read & write cleanup * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 */ static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) { if (pipe->files) mutex_lock_nested(&pipe->mutex, subclass); } void pipe_lock(struct pipe_inode_info *pipe) { /* * pipe_lock() nests non-pipe inode locks (for writing to a file) */ pipe_lock_nested(pipe, I_MUTEX_PARENT); } EXPORT_SYMBOL(pipe_lock); void pipe_unlock(struct pipe_inode_info *pipe) { if (pipe->files) mutex_unlock(&pipe->mutex); } EXPORT_SYMBOL(pipe_unlock); static inline void __pipe_lock(struct pipe_inode_info *pipe) { mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); } static inline void __pipe_unlock(struct pipe_inode_info *pipe) { mutex_unlock(&pipe->mutex); } void pipe_double_lock(struct pipe_inode_info *pipe1, struct pipe_inode_info *pipe2) { BUG_ON(pipe1 == pipe2); if (pipe1 < pipe2) { pipe_lock_nested(pipe1, I_MUTEX_PARENT); pipe_lock_nested(pipe2, I_MUTEX_CHILD); } else { pipe_lock_nested(pipe2, I_MUTEX_PARENT); pipe_lock_nested(pipe1, I_MUTEX_CHILD); } } static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep * allocation cache. (Otherwise just release our reference to it) */ if (page_count(page) == 1 && !pipe->tmp_page) pipe->tmp_page = page; else put_page(page); } static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; if (page_count(page) != 1) return false; memcg_kmem_uncharge_page(page, 0); __SetPageLocked(page); return true; } /** * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to attempt to steal * * Description: * This function attempts to steal the &struct page attached to * @buf. If successful, this function returns 0 and returns with * the page locked. The caller may then reuse the page for whatever * he wishes; the typical use is insertion into a different file * page cache. */ bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; /* * A reference of one is golden, that means that the owner of this * page is the only one holding a reference to it. lock the page * and return OK. */ if (page_count(page) == 1) { lock_page(page); return true; } return false; } EXPORT_SYMBOL(generic_pipe_buf_try_steal); /** * generic_pipe_buf_get - get a reference to a &struct pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to get a reference to * * Description: * This function grabs an extra reference to @buf. It's used in * the tee() system call, when we duplicate the buffers in one * pipe into another. */ bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return try_get_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_get); /** * generic_pipe_buf_release - put a reference to a &struct pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to put a reference to * * Description: * This function releases a reference to @buf. */ void generic_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { put_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { .release = anon_pipe_buf_release, .try_steal = anon_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ static inline bool pipe_readable(const struct pipe_inode_info *pipe) { unsigned int head = READ_ONCE(pipe->head); unsigned int tail = READ_ONCE(pipe->tail); unsigned int writers = READ_ONCE(pipe->writers); return !pipe_empty(head, tail) || !writers; } static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, struct pipe_buffer *buf, unsigned int tail) { pipe_buf_release(pipe, buf); /* * If the pipe has a watch_queue, we need additional protection * by the spinlock because notifications get posted with only * this spinlock, no mutex */ if (pipe_has_watch_queue(pipe)) { spin_lock_irq(&pipe->rd_wait.lock); #ifdef CONFIG_WATCH_QUEUE if (buf->flags & PIPE_BUF_FLAG_LOSS) pipe->note_loss = true; #endif pipe->tail = ++tail; spin_unlock_irq(&pipe->rd_wait.lock); return tail; } /* * Without a watch_queue, we can simply increment the tail * without the spinlock - the mutex is enough. */ pipe->tail = ++tail; return tail; } static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; bool was_full, wake_next_reader = false; ssize_t ret; /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; ret = 0; __pipe_lock(pipe); /* * We only wake up writers if the pipe was full when we started * reading in order to avoid unnecessary wakeups. * * But when we do wake up writers, we do so using a sync wakeup * (WF_SYNC), because we want them to get going and generate more * data for us. */ was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { /* Read ->head with a barrier vs post_one_notification() */ unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; #ifdef CONFIG_WATCH_QUEUE if (pipe->note_loss) { struct watch_notification n; if (total_len < 8) { if (ret == 0) ret = -ENOBUFS; break; } n.type = WATCH_TYPE_META; n.subtype = WATCH_META_LOSS_NOTIFICATION; n.info = watch_sizeof(n); if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) { if (ret == 0) ret = -EFAULT; break; } ret += sizeof(n); total_len -= sizeof(n); pipe->note_loss = false; } #endif if (!pipe_empty(head, tail)) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t chars = buf->len; size_t written; int error; if (chars > total_len) { if (buf->flags & PIPE_BUF_FLAG_WHOLE) { if (ret == 0) ret = -ENOBUFS; break; } chars = total_len; } error = pipe_buf_confirm(pipe, buf); if (error) { if (!ret) ret = error; break; } written = copy_page_to_iter(buf->page, buf->offset, chars, to); if (unlikely(written < chars)) { if (!ret) ret = -EFAULT; break; } ret += chars; buf->offset += chars; buf->len -= chars; /* Was it a packet buffer? Clean up and exit */ if (buf->flags & PIPE_BUF_FLAG_PACKET) { total_len = chars; buf->len = 0; } if (!buf->len) tail = pipe_update_tail(pipe, buf, tail); total_len -= chars; if (!total_len) break; /* common path: read succeeded */ if (!pipe_empty(head, tail)) /* More to do? */ continue; } if (!pipe->writers) break; if (ret) break; if ((filp->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) { ret = -EAGAIN; break; } __pipe_unlock(pipe); /* * We only get here if we didn't actually read anything. * * However, we could have seen (and removed) a zero-sized * pipe buffer, and might have made space in the buffers * that way. * * You can't make zero-sized pipe buffers by doing an empty * write (not even in packet mode), but they can happen if * the writer gets an EFAULT when trying to fill a buffer * that already got allocated and inserted in the buffer * array. * * So we still need to wake up any pending writers in the * _very_ unlikely case that the pipe was full, but we got * no data. */ if (unlikely(was_full)) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); /* * But because we didn't read anything, at this point we can * just return directly with -ERESTARTSYS if we're interrupted, * since we've done any required wakeups and there's no need * to mark anything accessed. And we've dropped the lock. */ if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; __pipe_lock(pipe); was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); wake_next_reader = true; } if (pipe_empty(pipe->head, pipe->tail)) wake_next_reader = false; __pipe_unlock(pipe); if (was_full) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); if (ret > 0) file_accessed(filp); return ret; } static inline int is_packetized(struct file *file) { return (file->f_flags & O_DIRECT) != 0; } /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ static inline bool pipe_writable(const struct pipe_inode_info *pipe) { unsigned int head = READ_ONCE(pipe->head); unsigned int tail = READ_ONCE(pipe->tail); unsigned int max_usage = READ_ONCE(pipe->max_usage); return !pipe_full(head, tail, max_usage) || !READ_ONCE(pipe->readers); } static ssize_t pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; unsigned int head; ssize_t ret = 0; size_t total_len = iov_iter_count(from); ssize_t chars; bool was_empty = false; bool wake_next_writer = false; /* Null write succeeds. */ if (unlikely(total_len == 0)) return 0; __pipe_lock(pipe); if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } if (pipe_has_watch_queue(pipe)) { ret = -EXDEV; goto out; } /* * If it wasn't empty we try to merge new data into * the last buffer. * * That naturally merges small writes, but it also * page-aligns the rest of the writes for large writes * spanning multiple pages. */ head = pipe->head; was_empty = pipe_empty(head, pipe->tail); chars = total_len & (PAGE_SIZE-1); if (chars && !was_empty) { unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; int offset = buf->offset + buf->len; if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) && offset + chars <= PAGE_SIZE) { ret = pipe_buf_confirm(pipe, buf); if (ret) goto out; ret = copy_page_from_iter(buf->page, offset, chars, from); if (unlikely(ret < chars)) { ret = -EFAULT; goto out; } buf->len += ret; if (!iov_iter_count(from)) goto out; } } for (;;) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } head = pipe->head; if (!pipe_full(head, pipe->tail, pipe->max_usage)) { unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf; struct page *page = pipe->tmp_page; int copied; if (!page) { page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); if (unlikely(!page)) { ret = ret ? : -ENOMEM; break; } pipe->tmp_page = page; } /* Allocate a slot in the ring in advance and attach an * empty buffer. If we fault or otherwise fail to use * it, either the reader will consume it or it'll still * be there for the next write. */ pipe->head = head + 1; /* Insert it into the buffer array */ buf = &pipe->bufs[head & mask]; buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = 0; if (is_packetized(filp)) buf->flags = PIPE_BUF_FLAG_PACKET; else buf->flags = PIPE_BUF_FLAG_CAN_MERGE; pipe->tmp_page = NULL; copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { if (!ret) ret = -EFAULT; break; } ret += copied; buf->len = copied; if (!iov_iter_count(from)) break; } if (!pipe_full(head, pipe->tail, pipe->max_usage)) continue; /* Wait for buffer space to become available. */ if ((filp->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) { if (!ret) ret = -EAGAIN; break; } if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; break; } /* * We're going to release the pipe lock and wait for more * space. We wake up any readers if necessary, and then * after waiting we need to re-check whether the pipe * become empty while we dropped the lock. */ __pipe_unlock(pipe); if (was_empty) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); __pipe_lock(pipe); was_empty = pipe_empty(pipe->head, pipe->tail); wake_next_writer = true; } out: if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) wake_next_writer = false; __pipe_unlock(pipe); /* * If we do do a wakeup event, we do a 'sync' wakeup, because we * want the reader to start processing things asap, rather than * leave the data pending. * * This is particularly important for small writes, because of * how (for example) the GNU make jobserver uses small writes to * wake up pending jobs * * Epoll nonsensically wants a wakeup whether the pipe * was already empty or not. */ if (was_empty || pipe->poll_usage) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { int err = file_update_time(filp); if (err) ret = err; sb_end_write(file_inode(filp)->i_sb); } return ret; } static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe = filp->private_data; unsigned int count, head, tail, mask; switch (cmd) { case FIONREAD: __pipe_lock(pipe); count = 0; head = pipe->head; tail = pipe->tail; mask = pipe->ring_size - 1; while (tail != head) { count += pipe->bufs[tail & mask].len; tail++; } __pipe_unlock(pipe); return put_user(count, (int __user *)arg); #ifdef CONFIG_WATCH_QUEUE case IOC_WATCH_QUEUE_SET_SIZE: { int ret; __pipe_lock(pipe); ret = watch_queue_set_size(pipe, arg); __pipe_unlock(pipe); return ret; } case IOC_WATCH_QUEUE_SET_FILTER: return watch_queue_set_filter( pipe, (struct watch_notification_filter __user *)arg); #endif default: return -ENOIOCTLCMD; } } /* No kernel lock held - fine */ static __poll_t pipe_poll(struct file *filp, poll_table *wait) { __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; unsigned int head, tail; /* Epoll has some historical nasty semantics, this enables them */ WRITE_ONCE(pipe->poll_usage, true); /* * Reading pipe state only -- no need for acquiring the semaphore. * * But because this is racy, the code has to add the * entry to the poll table _first_ .. */ if (filp->f_mode & FMODE_READ) poll_wait(filp, &pipe->rd_wait, wait); if (filp->f_mode & FMODE_WRITE) poll_wait(filp, &pipe->wr_wait, wait); /* * .. and only then can you do the racy tests. That way, * if something changes and you got it wrong, the poll * table entry will wake you up and fix it. */ head = READ_ONCE(pipe->head); tail = READ_ONCE(pipe->tail); mask = 0; if (filp->f_mode & FMODE_READ) { if (!pipe_empty(head, tail)) mask |= EPOLLIN | EPOLLRDNORM; if (!pipe->writers && filp->f_version != pipe->w_counter) mask |= EPOLLHUP; } if (filp->f_mode & FMODE_WRITE) { if (!pipe_full(head, tail, pipe->max_usage)) mask |= EPOLLOUT | EPOLLWRNORM; /* * Most Unices do not set EPOLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). */ if (!pipe->readers) mask |= EPOLLERR; } return mask; } static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe) { int kill = 0; spin_lock(&inode->i_lock); if (!--pipe->files) { inode->i_pipe = NULL; kill = 1; } spin_unlock(&inode->i_lock); if (kill) free_pipe_info(pipe); } static int pipe_release(struct inode *inode, struct file *file) { struct pipe_inode_info *pipe = file->private_data; __pipe_lock(pipe); if (file->f_mode & FMODE_READ) pipe->readers--; if (file->f_mode & FMODE_WRITE) pipe->writers--; /* Was that the last reader or writer, but not the other side? */ if (!pipe->readers != !pipe->writers) { wake_up_interruptible_all(&pipe->rd_wait); wake_up_interruptible_all(&pipe->wr_wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } __pipe_unlock(pipe); put_pipe_info(inode, pipe); return 0; } static int pipe_fasync(int fd, struct file *filp, int on) { struct pipe_inode_info *pipe = filp->private_data; int retval = 0; __pipe_lock(pipe); if (filp->f_mode & FMODE_READ) retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); if (retval < 0 && (filp->f_mode & FMODE_READ)) /* this can happen only if on == T */ fasync_helper(-1, filp, 0, &pipe->fasync_readers); } __pipe_unlock(pipe); return retval; } unsigned long account_pipe_buffers(struct user_struct *user, unsigned long old, unsigned long new) { return atomic_long_add_return(new - old, &user->pipe_bufs); } bool too_many_pipe_buffers_soft(unsigned long user_bufs) { unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft); return soft_limit && user_bufs > soft_limit; } bool too_many_pipe_buffers_hard(unsigned long user_bufs) { unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard); return hard_limit && user_bufs > hard_limit; } bool pipe_is_unprivileged_user(void) { return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); } struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; unsigned long pipe_bufs = PIPE_DEF_BUFFERS; struct user_struct *user = get_current_user(); unsigned long user_bufs; unsigned int max_size = READ_ONCE(pipe_max_size); pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); if (pipe == NULL) goto out_free_uid; if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE)) pipe_bufs = max_size >> PAGE_SHIFT; user_bufs = account_pipe_buffers(user, 0, pipe_bufs); if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) { user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS); pipe_bufs = PIPE_MIN_DEF_BUFFERS; } if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user()) goto out_revert_acct; pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), GFP_KERNEL_ACCOUNT); if (pipe->bufs) { init_waitqueue_head(&pipe->rd_wait); init_waitqueue_head(&pipe->wr_wait); pipe->r_counter = pipe->w_counter = 1; pipe->max_usage = pipe_bufs; pipe->ring_size = pipe_bufs; pipe->nr_accounted = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); return pipe; } out_revert_acct: (void) account_pipe_buffers(user, pipe_bufs, 0); kfree(pipe); out_free_uid: free_uid(user); return NULL; } void free_pipe_info(struct pipe_inode_info *pipe) { unsigned int i; #ifdef CONFIG_WATCH_QUEUE if (pipe->watch_queue) watch_queue_clear(pipe->watch_queue); #endif (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0); free_uid(pipe->user); for (i = 0; i < pipe->ring_size; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) pipe_buf_release(pipe, buf); } #ifdef CONFIG_WATCH_QUEUE if (pipe->watch_queue) put_watch_queue(pipe->watch_queue); #endif if (pipe->tmp_page) __free_page(pipe->tmp_page); kfree(pipe->bufs); kfree(pipe); } static struct vfsmount *pipe_mnt __ro_after_init; /* * pipefs_dname() is called from d_path(). */ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(buffer, buflen, "pipe:[%lu]", d_inode(dentry)->i_ino); } static const struct dentry_operations pipefs_dentry_operations = { .d_dname = pipefs_dname, }; static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); struct pipe_inode_info *pipe; if (!inode) goto fail_inode; inode->i_ino = get_next_ino(); pipe = alloc_pipe_info(); if (!pipe) goto fail_iput; inode->i_pipe = pipe; pipe->files = 2; pipe->readers = pipe->writers = 1; inode->i_fop = &pipefifo_fops; /* * Mark the inode dirty from the very beginning, * that way it will never be moved to the dirty * list because "mark_inode_dirty()" will think * that it already _is_ on the dirty list. */ inode->i_state = I_DIRTY; inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); simple_inode_init_ts(inode); return inode; fail_iput: iput(inode); fail_inode: return NULL; } int create_pipe_files(struct file **res, int flags) { struct inode *inode = get_pipe_inode(); struct file *f; int error; if (!inode) return -ENFILE; if (flags & O_NOTIFICATION_PIPE) { error = watch_queue_init(inode->i_pipe); if (error) { free_pipe_info(inode->i_pipe); iput(inode); return error; } } f = alloc_file_pseudo(inode, pipe_mnt, "", O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), &pipefifo_fops); if (IS_ERR(f)) { free_pipe_info(inode->i_pipe); iput(inode); return PTR_ERR(f); } f->private_data = inode->i_pipe; res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), &pipefifo_fops); if (IS_ERR(res[0])) { put_pipe_info(inode, inode->i_pipe); fput(f); return PTR_ERR(res[0]); } res[0]->private_data = inode->i_pipe; res[1] = f; stream_open(inode, res[0]); stream_open(inode, res[1]); return 0; } static int __do_pipe_flags(int *fd, struct file **files, int flags) { int error; int fdw, fdr; if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) return -EINVAL; error = create_pipe_files(files, flags); if (error) return error; error = get_unused_fd_flags(flags); if (error < 0) goto err_read_pipe; fdr = error; error = get_unused_fd_flags(flags); if (error < 0) goto err_fdr; fdw = error; audit_fd_pair(fdr, fdw); fd[0] = fdr; fd[1] = fdw; /* pipe groks IOCB_NOWAIT */ files[0]->f_mode |= FMODE_NOWAIT; files[1]->f_mode |= FMODE_NOWAIT; return 0; err_fdr: put_unused_fd(fdr); err_read_pipe: fput(files[0]); fput(files[1]); return error; } int do_pipe_flags(int *fd, int flags) { struct file *files[2]; int error = __do_pipe_flags(fd, files, flags); if (!error) { fd_install(fd[0], files[0]); fd_install(fd[1], files[1]); } return error; } /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. */ static int do_pipe2(int __user *fildes, int flags) { struct file *files[2]; int fd[2]; int error; error = __do_pipe_flags(fd, files, flags); if (!error) { if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) { fput(files[0]); fput(files[1]); put_unused_fd(fd[0]); put_unused_fd(fd[1]); error = -EFAULT; } else { fd_install(fd[0], files[0]); fd_install(fd[1], files[1]); } } return error; } SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) { return do_pipe2(fildes, flags); } SYSCALL_DEFINE1(pipe, int __user *, fildes) { return do_pipe2(fildes, 0); } /* * This is the stupid "wait for pipe to be readable or writable" * model. * * See pipe_read/write() for the proper kind of exclusive wait, * but that requires that we wake up any other readers/writers * if we then do not end up reading everything (ie the whole * "wake_next_reader/writer" logic in pipe_read/write()). */ void pipe_wait_readable(struct pipe_inode_info *pipe) { pipe_unlock(pipe); wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe)); pipe_lock(pipe); } void pipe_wait_writable(struct pipe_inode_info *pipe) { pipe_unlock(pipe); wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe)); pipe_lock(pipe); } /* * This depends on both the wait (here) and the wakeup (wake_up_partner) * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot * race with the count check and waitqueue prep. * * Normally in order to avoid races, you'd do the prepare_to_wait() first, * then check the condition you're waiting for, and only then sleep. But * because of the pipe lock, we can check the condition before being on * the wait queue. * * We use the 'rd_wait' waitqueue for pipe partner waiting. */ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) { DEFINE_WAIT(rdwait); int cur = *cnt; while (cur == *cnt) { prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); pipe_unlock(pipe); schedule(); finish_wait(&pipe->rd_wait, &rdwait); pipe_lock(pipe); if (signal_pending(current)) break; } return cur == *cnt ? -ERESTARTSYS : 0; } static void wake_up_partner(struct pipe_inode_info *pipe) { wake_up_interruptible_all(&pipe->rd_wait); } static int fifo_open(struct inode *inode, struct file *filp) { struct pipe_inode_info *pipe; bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; int ret; filp->f_version = 0; spin_lock(&inode->i_lock); if (inode->i_pipe) { pipe = inode->i_pipe; pipe->files++; spin_unlock(&inode->i_lock); } else { spin_unlock(&inode->i_lock); pipe = alloc_pipe_info(); if (!pipe) return -ENOMEM; pipe->files = 1; spin_lock(&inode->i_lock); if (unlikely(inode->i_pipe)) { inode->i_pipe->files++; spin_unlock(&inode->i_lock); free_pipe_info(pipe); pipe = inode->i_pipe; } else { inode->i_pipe = pipe; spin_unlock(&inode->i_lock); } } filp->private_data = pipe; /* OK, we have a pipe and it's pinned down */ __pipe_lock(pipe); /* We can only do regular read/write on fifos */ stream_open(inode, filp); switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) { case FMODE_READ: /* * O_RDONLY * POSIX.1 says that O_NONBLOCK means return with the FIFO * opened, even when there is no process writing the FIFO. */ pipe->r_counter++; if (pipe->readers++ == 0) wake_up_partner(pipe); if (!is_pipe && !pipe->writers) { if ((filp->f_flags & O_NONBLOCK)) { /* suppress EPOLLHUP until we have * seen a writer */ filp->f_version = pipe->w_counter; } else { if (wait_for_partner(pipe, &pipe->w_counter)) goto err_rd; } } break; case FMODE_WRITE: /* * O_WRONLY * POSIX.1 says that O_NONBLOCK means return -1 with * errno=ENXIO when there is no process reading the FIFO. */ ret = -ENXIO; if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers) goto err; pipe->w_counter++; if (!pipe->writers++) wake_up_partner(pipe); if (!is_pipe && !pipe->readers) { if (wait_for_partner(pipe, &pipe->r_counter)) goto err_wr; } break; case FMODE_READ | FMODE_WRITE: /* * O_RDWR * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. * This implementation will NEVER block on a O_RDWR open, since * the process can at least talk to itself. */ pipe->readers++; pipe->writers++; pipe->r_counter++; pipe->w_counter++; if (pipe->readers == 1 || pipe->writers == 1) wake_up_partner(pipe); break; default: ret = -EINVAL; goto err; } /* Ok! */ __pipe_unlock(pipe); return 0; err_rd: if (!--pipe->readers) wake_up_interruptible(&pipe->wr_wait); ret = -ERESTARTSYS; goto err; err_wr: if (!--pipe->writers) wake_up_interruptible_all(&pipe->rd_wait); ret = -ERESTARTSYS; goto err; err: __pipe_unlock(pipe); put_pipe_info(inode, pipe); return ret; } const struct file_operations pipefifo_fops = { .open = fifo_open, .llseek = no_llseek, .read_iter = pipe_read, .write_iter = pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, .fasync = pipe_fasync, .splice_write = iter_file_splice_write, }; /* * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ unsigned int round_pipe_size(unsigned int size) { if (size > (1U << 31)) return 0; /* Minimum pipe size, as required by POSIX */ if (size < PAGE_SIZE) return PAGE_SIZE; return roundup_pow_of_two(size); } /* * Resize the pipe ring to a number of slots. * * Note the pipe can be reduced in capacity, but only if the current * occupancy doesn't exceed nr_slots; if it does, EBUSY will be * returned instead. */ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) { struct pipe_buffer *bufs; unsigned int head, tail, mask, n; bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; spin_lock_irq(&pipe->rd_wait.lock); mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; n = pipe_occupancy(head, tail); if (nr_slots < n) { spin_unlock_irq(&pipe->rd_wait.lock); kfree(bufs); return -EBUSY; } /* * The pipe array wraps around, so just start the new one at zero * and adjust the indices. */ if (n > 0) { unsigned int h = head & mask; unsigned int t = tail & mask; if (h > t) { memcpy(bufs, pipe->bufs + t, n * sizeof(struct pipe_buffer)); } else { unsigned int tsize = pipe->ring_size - t; if (h > 0) memcpy(bufs + tsize, pipe->bufs, h * sizeof(struct pipe_buffer)); memcpy(bufs, pipe->bufs + t, tsize * sizeof(struct pipe_buffer)); } } head = n; tail = 0; kfree(pipe->bufs); pipe->bufs = bufs; pipe->ring_size = nr_slots; if (pipe->max_usage > nr_slots) pipe->max_usage = nr_slots; pipe->tail = tail; pipe->head = head; spin_unlock_irq(&pipe->rd_wait.lock); /* This might have made more room for writers */ wake_up_interruptible(&pipe->wr_wait); return 0; } /* * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) { unsigned long user_bufs; unsigned int nr_slots, size; long ret = 0; if (pipe_has_watch_queue(pipe)) return -EBUSY; size = round_pipe_size(arg); nr_slots = size >> PAGE_SHIFT; if (!nr_slots) return -EINVAL; /* * If trying to increase the pipe capacity, check that an * unprivileged user is not trying to exceed various limits * (soft limit check here, hard limit check just below). * Decreasing the pipe capacity is always permitted, even * if the user is currently over a limit. */ if (nr_slots > pipe->max_usage && size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) return -EPERM; user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots); if (nr_slots > pipe->max_usage && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && pipe_is_unprivileged_user()) { ret = -EPERM; goto out_revert_acct; } ret = pipe_resize_ring(pipe, nr_slots); if (ret < 0) goto out_revert_acct; pipe->max_usage = nr_slots; pipe->nr_accounted = nr_slots; return pipe->max_usage * PAGE_SIZE; out_revert_acct: (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted); return ret; } /* * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is * not enough to verify that this is a pipe. */ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { struct pipe_inode_info *pipe = file->private_data; if (file->f_op != &pipefifo_fops || !pipe) return NULL; if (for_splice && pipe_has_watch_queue(pipe)) return NULL; return pipe; } long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { struct pipe_inode_info *pipe; long ret; pipe = get_pipe_info(file, false); if (!pipe) return -EBADF; __pipe_lock(pipe); switch (cmd) { case F_SETPIPE_SZ: ret = pipe_set_size(pipe, arg); break; case F_GETPIPE_SZ: ret = pipe->max_usage * PAGE_SIZE; break; default: ret = -EINVAL; break; } __pipe_unlock(pipe); return ret; } static const struct super_operations pipefs_ops = { .destroy_inode = free_inode_nonrcu, .statfs = simple_statfs, }; /* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. So we don't need * any operations on the root directory. However, we need a non-trivial * d_name - pipe: will go nicely and kill the special-casing in procfs. */ static int pipefs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &pipefs_ops; ctx->dops = &pipefs_dentry_operations; return 0; } static struct file_system_type pipe_fs_type = { .name = "pipefs", .init_fs_context = pipefs_init_fs_context, .kill_sb = kill_anon_super, }; #ifdef CONFIG_SYSCTL static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { if (write) { unsigned int val; val = round_pipe_size(*lvalp); if (val == 0) return -EINVAL; *valp = val; } else { unsigned int val = *valp; *lvalp = (unsigned long) val; } return 0; } static int proc_dopipe_max_size(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_dopipe_max_size_conv, NULL); } static struct ctl_table fs_pipe_sysctls[] = { { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(pipe_max_size), .mode = 0644, .proc_handler = proc_dopipe_max_size, }, { .procname = "pipe-user-pages-hard", .data = &pipe_user_pages_hard, .maxlen = sizeof(pipe_user_pages_hard), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "pipe-user-pages-soft", .data = &pipe_user_pages_soft, .maxlen = sizeof(pipe_user_pages_soft), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { } }; #endif static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); if (!err) { pipe_mnt = kern_mount(&pipe_fs_type); if (IS_ERR(pipe_mnt)) { err = PTR_ERR(pipe_mnt); unregister_filesystem(&pipe_fs_type); } } #ifdef CONFIG_SYSCTL register_sysctl_init("fs", fs_pipe_sysctls); #endif return err; } fs_initcall(init_pipe_fs);
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix * Copyright (C) 2006 Andrey Volkov, Varma Electronics * Copyright (C) 2008-2009 Wolfgang Grandegger <wg@grandegger.com> */ #include <linux/can/dev.h> #include <linux/module.h> #define MOD_DESC "CAN device driver interface" MODULE_DESCRIPTION(MOD_DESC); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Wolfgang Grandegger <wg@grandegger.com>"); /* Local echo of CAN messages * * CAN network devices *should* support a local echo functionality * (see Documentation/networking/can.rst). To test the handling of CAN * interfaces that do not support the local echo both driver types are * implemented. In the case that the driver does not support the echo * the IFF_ECHO remains clear in dev->flags. This causes the PF_CAN core * to perform the echo as a fallback solution. */ void can_flush_echo_skb(struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); struct net_device_stats *stats = &dev->stats; int i; for (i = 0; i < priv->echo_skb_max; i++) { if (priv->echo_skb[i]) { kfree_skb(priv->echo_skb[i]); priv->echo_skb[i] = NULL; stats->tx_dropped++; stats->tx_aborted_errors++; } } } /* Put the skb on the stack to be looped backed locally lateron * * The function is typically called in the start_xmit function * of the device driver. The driver must protect access to * priv->echo_skb, if necessary. */ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx, unsigned int frame_len) { struct can_priv *priv = netdev_priv(dev); if (idx >= priv->echo_skb_max) { netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", __func__, idx, priv->echo_skb_max); return -EINVAL; } /* check flag whether this packet has to be looped back */ if (!(dev->flags & IFF_ECHO) || (skb->protocol != htons(ETH_P_CAN) && skb->protocol != htons(ETH_P_CANFD) && skb->protocol != htons(ETH_P_CANXL))) { kfree_skb(skb); return 0; } if (!priv->echo_skb[idx]) { skb = can_create_echo_skb(skb); if (!skb) return -ENOMEM; /* make settings for echo to reduce code in irq context */ skb->ip_summed = CHECKSUM_UNNECESSARY; skb->dev = dev; /* save frame_len to reuse it when transmission is completed */ can_skb_prv(skb)->frame_len = frame_len; if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; skb_tx_timestamp(skb); /* save this skb for tx interrupt echo handling */ priv->echo_skb[idx] = skb; } else { /* locking problem with netif_stop_queue() ?? */ netdev_err(dev, "%s: BUG! echo_skb %d is occupied!\n", __func__, idx); kfree_skb(skb); return -EBUSY; } return 0; } EXPORT_SYMBOL_GPL(can_put_echo_skb); struct sk_buff * __can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *len_ptr, unsigned int *frame_len_ptr) { struct can_priv *priv = netdev_priv(dev); if (idx >= priv->echo_skb_max) { netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", __func__, idx, priv->echo_skb_max); return NULL; } if (priv->echo_skb[idx]) { /* Using "struct canfd_frame::len" for the frame * length is supported on both CAN and CANFD frames. */ struct sk_buff *skb = priv->echo_skb[idx]; struct can_skb_priv *can_skb_priv = can_skb_prv(skb); if (skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS) skb_tstamp_tx(skb, skb_hwtstamps(skb)); /* get the real payload length for netdev statistics */ *len_ptr = can_skb_get_data_len(skb); if (frame_len_ptr) *frame_len_ptr = can_skb_priv->frame_len; priv->echo_skb[idx] = NULL; if (skb->pkt_type == PACKET_LOOPBACK) { skb->pkt_type = PACKET_BROADCAST; } else { dev_consume_skb_any(skb); return NULL; } return skb; } return NULL; } /* Get the skb from the stack and loop it back locally * * The function is typically called when the TX done interrupt * is handled in the device driver. The driver must protect * access to priv->echo_skb, if necessary. */ unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr) { struct sk_buff *skb; unsigned int len; skb = __can_get_echo_skb(dev, idx, &len, frame_len_ptr); if (!skb) return 0; skb_get(skb); if (netif_rx(skb) == NET_RX_SUCCESS) dev_consume_skb_any(skb); else dev_kfree_skb_any(skb); return len; } EXPORT_SYMBOL_GPL(can_get_echo_skb); /* Remove the skb from the stack and free it. * * The function is typically called when TX failed. */ void can_free_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr) { struct can_priv *priv = netdev_priv(dev); if (idx >= priv->echo_skb_max) { netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", __func__, idx, priv->echo_skb_max); return; } if (priv->echo_skb[idx]) { struct sk_buff *skb = priv->echo_skb[idx]; struct can_skb_priv *can_skb_priv = can_skb_prv(skb); if (frame_len_ptr) *frame_len_ptr = can_skb_priv->frame_len; dev_kfree_skb_any(skb); priv->echo_skb[idx] = NULL; } } EXPORT_SYMBOL_GPL(can_free_echo_skb); /* fill common values for CAN sk_buffs */ static void init_can_skb_reserve(struct sk_buff *skb) { skb->pkt_type = PACKET_BROADCAST; skb->ip_summed = CHECKSUM_UNNECESSARY; skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); can_skb_reserve(skb); can_skb_prv(skb)->skbcnt = 0; } struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf) { struct sk_buff *skb; skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + sizeof(struct can_frame)); if (unlikely(!skb)) { *cf = NULL; return NULL; } skb->protocol = htons(ETH_P_CAN); init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; *cf = skb_put_zero(skb, sizeof(struct can_frame)); return skb; } EXPORT_SYMBOL_GPL(alloc_can_skb); struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct canfd_frame **cfd) { struct sk_buff *skb; skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + sizeof(struct canfd_frame)); if (unlikely(!skb)) { *cfd = NULL; return NULL; } skb->protocol = htons(ETH_P_CANFD); init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; *cfd = skb_put_zero(skb, sizeof(struct canfd_frame)); /* set CAN FD flag by default */ (*cfd)->flags = CANFD_FDF; return skb; } EXPORT_SYMBOL_GPL(alloc_canfd_skb); struct sk_buff *alloc_canxl_skb(struct net_device *dev, struct canxl_frame **cxl, unsigned int data_len) { struct sk_buff *skb; if (data_len < CANXL_MIN_DLEN || data_len > CANXL_MAX_DLEN) goto out_error; skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + CANXL_HDR_SIZE + data_len); if (unlikely(!skb)) goto out_error; skb->protocol = htons(ETH_P_CANXL); init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; *cxl = skb_put_zero(skb, CANXL_HDR_SIZE + data_len); /* set CAN XL flag and length information by default */ (*cxl)->flags = CANXL_XLF; (*cxl)->len = data_len; return skb; out_error: *cxl = NULL; return NULL; } EXPORT_SYMBOL_GPL(alloc_canxl_skb); struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf) { struct sk_buff *skb; skb = alloc_can_skb(dev, cf); if (unlikely(!skb)) return NULL; (*cf)->can_id = CAN_ERR_FLAG; (*cf)->len = CAN_ERR_DLC; return skb; } EXPORT_SYMBOL_GPL(alloc_can_err_skb); /* Check for outgoing skbs that have not been created by the CAN subsystem */ static bool can_skb_headroom_valid(struct net_device *dev, struct sk_buff *skb) { /* af_packet creates a headroom of HH_DATA_MOD bytes which is fine */ if (WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct can_skb_priv))) return false; /* af_packet does not apply CAN skb specific settings */ if (skb->ip_summed == CHECKSUM_NONE) { /* init headroom */ can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb->ip_summed = CHECKSUM_UNNECESSARY; /* perform proper loopback on capable devices */ if (dev->flags & IFF_ECHO) skb->pkt_type = PACKET_LOOPBACK; else skb->pkt_type = PACKET_HOST; skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); /* set CANFD_FDF flag for CAN FD frames */ if (can_is_canfd_skb(skb)) { struct canfd_frame *cfd; cfd = (struct canfd_frame *)skb->data; cfd->flags |= CANFD_FDF; } } return true; } /* Drop a given socketbuffer if it does not contain a valid CAN frame. */ bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb) { switch (ntohs(skb->protocol)) { case ETH_P_CAN: if (!can_is_can_skb(skb)) goto inval_skb; break; case ETH_P_CANFD: if (!can_is_canfd_skb(skb)) goto inval_skb; break; case ETH_P_CANXL: if (!can_is_canxl_skb(skb)) goto inval_skb; break; default: goto inval_skb; } if (!can_skb_headroom_valid(dev, skb)) goto inval_skb; return false; inval_skb: kfree_skb(skb); dev->stats.tx_dropped++; return true; } EXPORT_SYMBOL_GPL(can_dropped_invalid_skb);
1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2020 Google Corporation */ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/mgmt.h> #include "hci_request.h" #include "mgmt_util.h" #include "msft.h" #define MSFT_RSSI_THRESHOLD_VALUE_MIN -127 #define MSFT_RSSI_THRESHOLD_VALUE_MAX 20 #define MSFT_RSSI_LOW_TIMEOUT_MAX 0x3C #define MSFT_OP_READ_SUPPORTED_FEATURES 0x00 struct msft_cp_read_supported_features { __u8 sub_opcode; } __packed; struct msft_rp_read_supported_features { __u8 status; __u8 sub_opcode; __le64 features; __u8 evt_prefix_len; __u8 evt_prefix[]; } __packed; #define MSFT_OP_LE_MONITOR_ADVERTISEMENT 0x03 #define MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN 0x01 struct msft_le_monitor_advertisement_pattern { __u8 length; __u8 data_type; __u8 start_byte; __u8 pattern[]; }; struct msft_le_monitor_advertisement_pattern_data { __u8 count; __u8 data[]; }; struct msft_cp_le_monitor_advertisement { __u8 sub_opcode; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 cond_type; __u8 data[]; } __packed; struct msft_rp_le_monitor_advertisement { __u8 status; __u8 sub_opcode; __u8 handle; } __packed; #define MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT 0x04 struct msft_cp_le_cancel_monitor_advertisement { __u8 sub_opcode; __u8 handle; } __packed; struct msft_rp_le_cancel_monitor_advertisement { __u8 status; __u8 sub_opcode; } __packed; #define MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE 0x05 struct msft_cp_le_set_advertisement_filter_enable { __u8 sub_opcode; __u8 enable; } __packed; struct msft_rp_le_set_advertisement_filter_enable { __u8 status; __u8 sub_opcode; } __packed; #define MSFT_EV_LE_MONITOR_DEVICE 0x02 struct msft_ev_le_monitor_device { __u8 addr_type; bdaddr_t bdaddr; __u8 monitor_handle; __u8 monitor_state; } __packed; struct msft_monitor_advertisement_handle_data { __u8 msft_handle; __u16 mgmt_handle; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 cond_type; struct list_head list; }; enum monitor_addr_filter_state { AF_STATE_IDLE, AF_STATE_ADDING, AF_STATE_ADDED, AF_STATE_REMOVING, }; #define MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR 0x04 struct msft_monitor_addr_filter_data { __u8 msft_handle; __u8 pattern_handle; /* address filters pertain to */ __u16 mgmt_handle; int state; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 addr_type; bdaddr_t bdaddr; struct list_head list; }; struct msft_data { __u64 features; __u8 evt_prefix_len; __u8 *evt_prefix; struct list_head handle_map; struct list_head address_filters; __u8 resuming; __u8 suspending; __u8 filter_enabled; /* To synchronize add/remove address filter and monitor device event.*/ struct mutex filter_lock; }; bool msft_monitor_supported(struct hci_dev *hdev) { return !!(msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR); } static bool read_supported_features(struct hci_dev *hdev, struct msft_data *msft) { struct msft_cp_read_supported_features cp; struct msft_rp_read_supported_features *rp; struct sk_buff *skb; cp.sub_opcode = MSFT_OP_READ_SUPPORTED_FEATURES; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read MSFT supported features (%ld)", PTR_ERR(skb)); return false; } if (skb->len < sizeof(*rp)) { bt_dev_err(hdev, "MSFT supported features length mismatch"); goto failed; } rp = (struct msft_rp_read_supported_features *)skb->data; if (rp->sub_opcode != MSFT_OP_READ_SUPPORTED_FEATURES) goto failed; if (rp->evt_prefix_len > 0) { msft->evt_prefix = kmemdup(rp->evt_prefix, rp->evt_prefix_len, GFP_KERNEL); if (!msft->evt_prefix) goto failed; } msft->evt_prefix_len = rp->evt_prefix_len; msft->features = __le64_to_cpu(rp->features); if (msft->features & MSFT_FEATURE_MASK_CURVE_VALIDITY) hdev->msft_curve_validity = true; kfree_skb(skb); return true; failed: kfree_skb(skb); return false; } /* is_mgmt = true matches the handle exposed to userspace via mgmt. * is_mgmt = false matches the handle used by the msft controller. * This function requires the caller holds hdev->lock */ static struct msft_monitor_advertisement_handle_data *msft_find_handle_data (struct hci_dev *hdev, u16 handle, bool is_mgmt) { struct msft_monitor_advertisement_handle_data *entry; struct msft_data *msft = hdev->msft_data; list_for_each_entry(entry, &msft->handle_map, list) { if (is_mgmt && entry->mgmt_handle == handle) return entry; if (!is_mgmt && entry->msft_handle == handle) return entry; } return NULL; } /* This function requires the caller holds msft->filter_lock */ static struct msft_monitor_addr_filter_data *msft_find_address_data (struct hci_dev *hdev, u8 addr_type, bdaddr_t *addr, u8 pattern_handle) { struct msft_monitor_addr_filter_data *entry; struct msft_data *msft = hdev->msft_data; list_for_each_entry(entry, &msft->address_filters, list) { if (entry->pattern_handle == pattern_handle && addr_type == entry->addr_type && !bacmp(addr, &entry->bdaddr)) return entry; } return NULL; } /* This function requires the caller holds hdev->lock */ static int msft_monitor_device_del(struct hci_dev *hdev, __u16 mgmt_handle, bdaddr_t *bdaddr, __u8 addr_type, bool notify) { struct monitored_device *dev, *tmp; int count = 0; list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) { /* mgmt_handle == 0 indicates remove all devices, whereas, * bdaddr == NULL indicates remove all devices matching the * mgmt_handle. */ if ((!mgmt_handle || dev->handle == mgmt_handle) && (!bdaddr || (!bacmp(bdaddr, &dev->bdaddr) && addr_type == dev->addr_type))) { if (notify && dev->notified) { mgmt_adv_monitor_device_lost(hdev, dev->handle, &dev->bdaddr, dev->addr_type); } list_del(&dev->list); kfree(dev); count++; } } return count; } static int msft_le_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, struct adv_monitor *monitor, struct sk_buff *skb) { struct msft_rp_le_monitor_advertisement *rp; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; int status = 0; hci_dev_lock(hdev); rp = (struct msft_rp_le_monitor_advertisement *)skb->data; if (skb->len < sizeof(*rp)) { status = HCI_ERROR_UNSPECIFIED; goto unlock; } status = rp->status; if (status) goto unlock; handle_data = kmalloc(sizeof(*handle_data), GFP_KERNEL); if (!handle_data) { status = HCI_ERROR_UNSPECIFIED; goto unlock; } handle_data->mgmt_handle = monitor->handle; handle_data->msft_handle = rp->handle; handle_data->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; INIT_LIST_HEAD(&handle_data->list); list_add(&handle_data->list, &msft->handle_map); monitor->state = ADV_MONITOR_STATE_OFFLOADED; unlock: if (status) hci_free_adv_monitor(hdev, monitor); hci_dev_unlock(hdev); return status; } /* This function requires the caller holds hci_req_sync_lock */ static void msft_remove_addr_filters_sync(struct hci_dev *hdev, u8 handle) { struct msft_monitor_addr_filter_data *address_filter, *n; struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_data *msft = hdev->msft_data; struct list_head head; struct sk_buff *skb; INIT_LIST_HEAD(&head); /* Cancel all corresponding address monitors */ mutex_lock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &msft->address_filters, list) { if (address_filter->pattern_handle != handle) continue; list_del(&address_filter->list); /* Keep the address filter and let * msft_add_address_filter_sync() remove and free the address * filter. */ if (address_filter->state == AF_STATE_ADDING) { address_filter->state = AF_STATE_REMOVING; continue; } /* Keep the address filter and let * msft_cancel_address_filter_sync() remove and free the address * filter */ if (address_filter->state == AF_STATE_REMOVING) continue; list_add_tail(&address_filter->list, &head); } mutex_unlock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &head, list) { list_del(&address_filter->list); cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = address_filter->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { kfree(address_filter); continue; } kfree_skb(skb); bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter", &address_filter->bdaddr); kfree(address_filter); } } static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, struct adv_monitor *monitor, struct sk_buff *skb) { struct msft_rp_le_cancel_monitor_advertisement *rp; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; int status = 0; u8 msft_handle; rp = (struct msft_rp_le_cancel_monitor_advertisement *)skb->data; if (skb->len < sizeof(*rp)) { status = HCI_ERROR_UNSPECIFIED; goto done; } status = rp->status; if (status) goto done; hci_dev_lock(hdev); handle_data = msft_find_handle_data(hdev, monitor->handle, true); if (handle_data) { if (monitor->state == ADV_MONITOR_STATE_OFFLOADED) monitor->state = ADV_MONITOR_STATE_REGISTERED; /* Do not free the monitor if it is being removed due to * suspend. It will be re-monitored on resume. */ if (!msft->suspending) { hci_free_adv_monitor(hdev, monitor); /* Clear any monitored devices by this Adv Monitor */ msft_monitor_device_del(hdev, handle_data->mgmt_handle, NULL, 0, false); } msft_handle = handle_data->msft_handle; list_del(&handle_data->list); kfree(handle_data); hci_dev_unlock(hdev); msft_remove_addr_filters_sync(hdev, msft_handle); } else { hci_dev_unlock(hdev); } done: return status; } /* This function requires the caller holds hci_req_sync_lock */ static int msft_remove_monitor_sync(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_monitor_advertisement_handle_data *handle_data; struct sk_buff *skb; handle_data = msft_find_handle_data(hdev, monitor->handle, true); /* If no matched handle, just remove without telling controller */ if (!handle_data) return -ENOENT; cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = handle_data->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) return PTR_ERR(skb); return msft_le_cancel_monitor_advertisement_cb(hdev, hdev->msft_opcode, monitor, skb); } /* This function requires the caller holds hci_req_sync_lock */ int msft_suspend_sync(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; struct adv_monitor *monitor; int handle = 0; if (!msft || !msft_monitor_supported(hdev)) return 0; msft->suspending = true; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &handle); if (!monitor) break; msft_remove_monitor_sync(hdev, monitor); handle++; } /* All monitors have been removed */ msft->suspending = false; return 0; } static bool msft_monitor_rssi_valid(struct adv_monitor *monitor) { struct adv_rssi_thresholds *r = &monitor->rssi; if (r->high_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN || r->high_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX || r->low_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN || r->low_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX) return false; /* High_threshold_timeout is not supported, * once high_threshold is reached, events are immediately reported. */ if (r->high_threshold_timeout != 0) return false; if (r->low_threshold_timeout > MSFT_RSSI_LOW_TIMEOUT_MAX) return false; /* Sampling period from 0x00 to 0xFF are all allowed */ return true; } static bool msft_monitor_pattern_valid(struct adv_monitor *monitor) { return msft_monitor_rssi_valid(monitor); /* No additional check needed for pattern-based monitor */ } static int msft_add_monitor_sync(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_cp_le_monitor_advertisement *cp; struct msft_le_monitor_advertisement_pattern_data *pattern_data; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_le_monitor_advertisement_pattern *pattern; struct adv_pattern *entry; size_t total_size = sizeof(*cp) + sizeof(*pattern_data); ptrdiff_t offset = 0; u8 pattern_count = 0; struct sk_buff *skb; int err; if (!msft_monitor_pattern_valid(monitor)) return -EINVAL; list_for_each_entry(entry, &monitor->patterns, list) { pattern_count++; total_size += sizeof(*pattern) + entry->length; } cp = kmalloc(total_size, GFP_KERNEL); if (!cp) return -ENOMEM; cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT; cp->rssi_high = monitor->rssi.high_threshold; cp->rssi_low = monitor->rssi.low_threshold; cp->rssi_low_interval = (u8)monitor->rssi.low_threshold_timeout; cp->rssi_sampling_period = monitor->rssi.sampling_period; cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; pattern_data = (void *)cp->data; pattern_data->count = pattern_count; list_for_each_entry(entry, &monitor->patterns, list) { pattern = (void *)(pattern_data->data + offset); /* the length also includes data_type and offset */ pattern->length = entry->length + 2; pattern->data_type = entry->ad_type; pattern->start_byte = entry->offset; memcpy(pattern->pattern, entry->value, entry->length); offset += sizeof(*pattern) + entry->length; } skb = __hci_cmd_sync(hdev, hdev->msft_opcode, total_size, cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); goto out_free; } err = msft_le_monitor_advertisement_cb(hdev, hdev->msft_opcode, monitor, skb); if (err) goto out_free; handle_data = msft_find_handle_data(hdev, monitor->handle, true); if (!handle_data) { err = -ENODATA; goto out_free; } handle_data->rssi_high = cp->rssi_high; handle_data->rssi_low = cp->rssi_low; handle_data->rssi_low_interval = cp->rssi_low_interval; handle_data->rssi_sampling_period = cp->rssi_sampling_period; out_free: kfree(cp); return err; } /* This function requires the caller holds hci_req_sync_lock */ static void reregister_monitor(struct hci_dev *hdev) { struct adv_monitor *monitor; struct msft_data *msft = hdev->msft_data; int handle = 0; if (!msft) return; msft->resuming = true; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &handle); if (!monitor) break; msft_add_monitor_sync(hdev, monitor); handle++; } /* All monitors have been reregistered */ msft->resuming = false; } /* This function requires the caller holds hci_req_sync_lock */ int msft_resume_sync(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; if (!msft || !msft_monitor_supported(hdev)) return 0; hci_dev_lock(hdev); /* Clear already tracked devices on resume. Once the monitors are * reregistered, devices in range will be found again after resume. */ hdev->advmon_pend_notify = false; msft_monitor_device_del(hdev, 0, NULL, 0, true); hci_dev_unlock(hdev); reregister_monitor(hdev); return 0; } /* This function requires the caller holds hci_req_sync_lock */ void msft_do_open(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; if (hdev->msft_opcode == HCI_OP_NOP) return; if (!msft) { bt_dev_err(hdev, "MSFT extension not registered"); return; } bt_dev_dbg(hdev, "Initialize MSFT extension"); /* Reset existing MSFT data before re-reading */ kfree(msft->evt_prefix); msft->evt_prefix = NULL; msft->evt_prefix_len = 0; msft->features = 0; if (!read_supported_features(hdev, msft)) { hdev->msft_data = NULL; kfree(msft); return; } if (msft_monitor_supported(hdev)) { msft->resuming = true; msft_set_filter_enable(hdev, true); /* Monitors get removed on power off, so we need to explicitly * tell the controller to re-monitor. */ reregister_monitor(hdev); } } void msft_do_close(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; struct msft_monitor_advertisement_handle_data *handle_data, *tmp; struct msft_monitor_addr_filter_data *address_filter, *n; struct adv_monitor *monitor; if (!msft) return; bt_dev_dbg(hdev, "Cleanup of MSFT extension"); /* The controller will silently remove all monitors on power off. * Therefore, remove handle_data mapping and reset monitor state. */ list_for_each_entry_safe(handle_data, tmp, &msft->handle_map, list) { monitor = idr_find(&hdev->adv_monitors_idr, handle_data->mgmt_handle); if (monitor && monitor->state == ADV_MONITOR_STATE_OFFLOADED) monitor->state = ADV_MONITOR_STATE_REGISTERED; list_del(&handle_data->list); kfree(handle_data); } mutex_lock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &msft->address_filters, list) { list_del(&address_filter->list); kfree(address_filter); } mutex_unlock(&msft->filter_lock); hci_dev_lock(hdev); /* Clear any devices that are being monitored and notify device lost */ hdev->advmon_pend_notify = false; msft_monitor_device_del(hdev, 0, NULL, 0, true); hci_dev_unlock(hdev); } static int msft_cancel_address_filter_sync(struct hci_dev *hdev, void *data) { struct msft_monitor_addr_filter_data *address_filter = data; struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_data *msft = hdev->msft_data; struct sk_buff *skb; int err = 0; if (!msft) { bt_dev_err(hdev, "MSFT: msft data is freed"); return -EINVAL; } /* The address filter has been removed by hci dev close */ if (!test_bit(HCI_UP, &hdev->flags)) return 0; mutex_lock(&msft->filter_lock); list_del(&address_filter->list); mutex_unlock(&msft->filter_lock); cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = address_filter->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { bt_dev_err(hdev, "MSFT: Failed to cancel address (%pMR) filter", &address_filter->bdaddr); err = PTR_ERR(skb); goto done; } kfree_skb(skb); bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter", &address_filter->bdaddr); done: kfree(address_filter); return err; } void msft_register(struct hci_dev *hdev) { struct msft_data *msft = NULL; bt_dev_dbg(hdev, "Register MSFT extension"); msft = kzalloc(sizeof(*msft), GFP_KERNEL); if (!msft) { bt_dev_err(hdev, "Failed to register MSFT extension"); return; } INIT_LIST_HEAD(&msft->handle_map); INIT_LIST_HEAD(&msft->address_filters); hdev->msft_data = msft; mutex_init(&msft->filter_lock); } void msft_unregister(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; if (!msft) return; bt_dev_dbg(hdev, "Unregister MSFT extension"); hdev->msft_data = NULL; kfree(msft->evt_prefix); mutex_destroy(&msft->filter_lock); kfree(msft); } /* This function requires the caller holds hdev->lock */ static void msft_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 addr_type, __u16 mgmt_handle) { struct monitored_device *dev; dev = kmalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { bt_dev_err(hdev, "MSFT vendor event %u: no memory", MSFT_EV_LE_MONITOR_DEVICE); return; } bacpy(&dev->bdaddr, bdaddr); dev->addr_type = addr_type; dev->handle = mgmt_handle; dev->notified = false; INIT_LIST_HEAD(&dev->list); list_add(&dev->list, &hdev->monitored_devices); hdev->advmon_pend_notify = true; } /* This function requires the caller holds hdev->lock */ static void msft_device_lost(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 addr_type, __u16 mgmt_handle) { if (!msft_monitor_device_del(hdev, mgmt_handle, bdaddr, addr_type, true)) { bt_dev_err(hdev, "MSFT vendor event %u: dev %pMR not in list", MSFT_EV_LE_MONITOR_DEVICE, bdaddr); } } static void *msft_skb_pull(struct hci_dev *hdev, struct sk_buff *skb, u8 ev, size_t len) { void *data; data = skb_pull_data(skb, len); if (!data) bt_dev_err(hdev, "Malformed MSFT vendor event: 0x%02x", ev); return data; } static int msft_add_address_filter_sync(struct hci_dev *hdev, void *data) { struct msft_monitor_addr_filter_data *address_filter = data; struct msft_rp_le_monitor_advertisement *rp; struct msft_cp_le_monitor_advertisement *cp; struct msft_data *msft = hdev->msft_data; struct sk_buff *skb = NULL; bool remove = false; size_t size; if (!msft) { bt_dev_err(hdev, "MSFT: msft data is freed"); return -EINVAL; } /* The address filter has been removed by hci dev close */ if (!test_bit(HCI_UP, &hdev->flags)) return -ENODEV; /* We are safe to use the address filter from now on. * msft_monitor_device_evt() wouldn't delete this filter because it's * not been added by now. * And all other functions that requiring hci_req_sync_lock wouldn't * touch this filter before this func completes because it's protected * by hci_req_sync_lock. */ if (address_filter->state == AF_STATE_REMOVING) { mutex_lock(&msft->filter_lock); list_del(&address_filter->list); mutex_unlock(&msft->filter_lock); kfree(address_filter); return 0; } size = sizeof(*cp) + sizeof(address_filter->addr_type) + sizeof(address_filter->bdaddr); cp = kzalloc(size, GFP_KERNEL); if (!cp) { bt_dev_err(hdev, "MSFT: Alloc cmd param err"); remove = true; goto done; } cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT; cp->rssi_high = address_filter->rssi_high; cp->rssi_low = address_filter->rssi_low; cp->rssi_low_interval = address_filter->rssi_low_interval; cp->rssi_sampling_period = address_filter->rssi_sampling_period; cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR; cp->data[0] = address_filter->addr_type; memcpy(&cp->data[1], &address_filter->bdaddr, sizeof(address_filter->bdaddr)); skb = __hci_cmd_sync(hdev, hdev->msft_opcode, size, cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to enable address %pMR filter", &address_filter->bdaddr); skb = NULL; remove = true; goto done; } rp = skb_pull_data(skb, sizeof(*rp)); if (!rp || rp->sub_opcode != MSFT_OP_LE_MONITOR_ADVERTISEMENT || rp->status) remove = true; done: mutex_lock(&msft->filter_lock); if (remove) { bt_dev_warn(hdev, "MSFT: Remove address (%pMR) filter", &address_filter->bdaddr); list_del(&address_filter->list); kfree(address_filter); } else { address_filter->state = AF_STATE_ADDED; address_filter->msft_handle = rp->handle; bt_dev_dbg(hdev, "MSFT: Address %pMR filter enabled", &address_filter->bdaddr); } mutex_unlock(&msft->filter_lock); kfree_skb(skb); return 0; } /* This function requires the caller holds msft->filter_lock */ static struct msft_monitor_addr_filter_data *msft_add_address_filter (struct hci_dev *hdev, u8 addr_type, bdaddr_t *bdaddr, struct msft_monitor_advertisement_handle_data *handle_data) { struct msft_monitor_addr_filter_data *address_filter = NULL; struct msft_data *msft = hdev->msft_data; int err; address_filter = kzalloc(sizeof(*address_filter), GFP_KERNEL); if (!address_filter) return NULL; address_filter->state = AF_STATE_ADDING; address_filter->msft_handle = 0xff; address_filter->pattern_handle = handle_data->msft_handle; address_filter->mgmt_handle = handle_data->mgmt_handle; address_filter->rssi_high = handle_data->rssi_high; address_filter->rssi_low = handle_data->rssi_low; address_filter->rssi_low_interval = handle_data->rssi_low_interval; address_filter->rssi_sampling_period = handle_data->rssi_sampling_period; address_filter->addr_type = addr_type; bacpy(&address_filter->bdaddr, bdaddr); /* With the above AF_STATE_ADDING, duplicated address filter can be * avoided when receiving monitor device event (found/lost) frequently * for the same device. */ list_add_tail(&address_filter->list, &msft->address_filters); err = hci_cmd_sync_queue(hdev, msft_add_address_filter_sync, address_filter, NULL); if (err < 0) { bt_dev_err(hdev, "MSFT: Add address %pMR filter err", bdaddr); list_del(&address_filter->list); kfree(address_filter); return NULL; } bt_dev_dbg(hdev, "MSFT: Add device %pMR address filter", &address_filter->bdaddr); return address_filter; } /* This function requires the caller holds hdev->lock */ static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct msft_monitor_addr_filter_data *n, *address_filter = NULL; struct msft_ev_le_monitor_device *ev; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; u16 mgmt_handle = 0xffff; u8 addr_type; ev = msft_skb_pull(hdev, skb, MSFT_EV_LE_MONITOR_DEVICE, sizeof(*ev)); if (!ev) return; bt_dev_dbg(hdev, "MSFT vendor event 0x%02x: handle 0x%04x state %d addr %pMR", MSFT_EV_LE_MONITOR_DEVICE, ev->monitor_handle, ev->monitor_state, &ev->bdaddr); handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false); if (!test_bit(HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, &hdev->quirks)) { if (!handle_data) return; mgmt_handle = handle_data->mgmt_handle; goto report_state; } if (handle_data) { /* Don't report any device found/lost event from pattern * monitors. Pattern monitor always has its address filters for * tracking devices. */ address_filter = msft_find_address_data(hdev, ev->addr_type, &ev->bdaddr, handle_data->msft_handle); if (address_filter) return; if (ev->monitor_state && handle_data->cond_type == MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN) msft_add_address_filter(hdev, ev->addr_type, &ev->bdaddr, handle_data); return; } /* This device event is not from pattern monitor. * Report it if there is a corresponding address_filter for it. */ list_for_each_entry(n, &msft->address_filters, list) { if (n->state == AF_STATE_ADDED && n->msft_handle == ev->monitor_handle) { mgmt_handle = n->mgmt_handle; address_filter = n; break; } } if (!address_filter) { bt_dev_warn(hdev, "MSFT: Unexpected device event %pMR, %u, %u", &ev->bdaddr, ev->monitor_handle, ev->monitor_state); return; } report_state: switch (ev->addr_type) { case ADDR_LE_DEV_PUBLIC: addr_type = BDADDR_LE_PUBLIC; break; case ADDR_LE_DEV_RANDOM: addr_type = BDADDR_LE_RANDOM; break; default: bt_dev_err(hdev, "MSFT vendor event 0x%02x: unknown addr type 0x%02x", MSFT_EV_LE_MONITOR_DEVICE, ev->addr_type); return; } if (ev->monitor_state) { msft_device_found(hdev, &ev->bdaddr, addr_type, mgmt_handle); } else { if (address_filter && address_filter->state == AF_STATE_ADDED) { address_filter->state = AF_STATE_REMOVING; hci_cmd_sync_queue(hdev, msft_cancel_address_filter_sync, address_filter, NULL); } msft_device_lost(hdev, &ev->bdaddr, addr_type, mgmt_handle); } } void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct msft_data *msft = hdev->msft_data; u8 *evt_prefix; u8 *evt; if (!msft) return; /* When the extension has defined an event prefix, check that it * matches, and otherwise just return. */ if (msft->evt_prefix_len > 0) { evt_prefix = msft_skb_pull(hdev, skb, 0, msft->evt_prefix_len); if (!evt_prefix) return; if (memcmp(evt_prefix, msft->evt_prefix, msft->evt_prefix_len)) return; } /* Every event starts at least with an event code and the rest of * the data is variable and depends on the event code. */ if (skb->len < 1) return; evt = msft_skb_pull(hdev, skb, 0, sizeof(*evt)); if (!evt) return; hci_dev_lock(hdev); switch (*evt) { case MSFT_EV_LE_MONITOR_DEVICE: mutex_lock(&msft->filter_lock); msft_monitor_device_evt(hdev, skb); mutex_unlock(&msft->filter_lock); break; default: bt_dev_dbg(hdev, "MSFT vendor event 0x%02x", *evt); break; } hci_dev_unlock(hdev); } __u64 msft_get_features(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; return msft ? msft->features : 0; } static void msft_le_set_advertisement_filter_enable_cb(struct hci_dev *hdev, void *user_data, u8 status) { struct msft_cp_le_set_advertisement_filter_enable *cp = user_data; struct msft_data *msft = hdev->msft_data; /* Error 0x0C would be returned if the filter enabled status is * already set to whatever we were trying to set. * Although the default state should be disabled, some controller set * the initial value to enabled. Because there is no way to know the * actual initial value before sending this command, here we also treat * error 0x0C as success. */ if (status != 0x00 && status != 0x0C) return; hci_dev_lock(hdev); msft->filter_enabled = cp->enable; if (status == 0x0C) bt_dev_warn(hdev, "MSFT filter_enable is already %s", cp->enable ? "on" : "off"); hci_dev_unlock(hdev); } /* This function requires the caller holds hci_req_sync_lock */ int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_data *msft = hdev->msft_data; if (!msft) return -EOPNOTSUPP; if (msft->resuming || msft->suspending) return -EBUSY; return msft_add_monitor_sync(hdev, monitor); } /* This function requires the caller holds hci_req_sync_lock */ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_data *msft = hdev->msft_data; if (!msft) return -EOPNOTSUPP; if (msft->resuming || msft->suspending) return -EBUSY; return msft_remove_monitor_sync(hdev, monitor); } int msft_set_filter_enable(struct hci_dev *hdev, bool enable) { struct msft_cp_le_set_advertisement_filter_enable cp; struct msft_data *msft = hdev->msft_data; int err; if (!msft) return -EOPNOTSUPP; cp.sub_opcode = MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE; cp.enable = enable; err = __hci_cmd_sync_status(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); msft_le_set_advertisement_filter_enable_cb(hdev, &cp, err); return 0; } bool msft_curve_validity(struct hci_dev *hdev) { return hdev->msft_curve_validity; }
6697 6684 3933 6698 6807 6717 3368 6810 2669 2669 1323 130 106 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 // SPDX-License-Identifier: GPL-2.0-only /* * lib/bitmap.c * Helper functions for bitmap.h. */ #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/ctype.h> #include <linux/device.h> #include <linux/export.h> #include <linux/slab.h> /** * DOC: bitmap introduction * * bitmaps provide an array of bits, implemented using an * array of unsigned longs. The number of valid bits in a * given bitmap does _not_ need to be an exact multiple of * BITS_PER_LONG. * * The possible unused bits in the last, partially used word * of a bitmap are 'don't care'. The implementation makes * no particular effort to keep them zero. It ensures that * their value will not affect the results of any operation. * The bitmap operations that return Boolean (bitmap_empty, * for example) or scalar (bitmap_weight, for example) results * carefully filter out these unused bits from impacting their * results. * * The byte ordering of bitmaps is more natural on little * endian architectures. See the big-endian headers * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h * for the best explanations of this ordering. */ bool __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] != bitmap2[k]) return false; if (bits % BITS_PER_LONG) if ((bitmap1[k] ^ bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return false; return true; } EXPORT_SYMBOL(__bitmap_equal); bool __bitmap_or_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, const unsigned long *bitmap3, unsigned int bits) { unsigned int k, lim = bits / BITS_PER_LONG; unsigned long tmp; for (k = 0; k < lim; ++k) { if ((bitmap1[k] | bitmap2[k]) != bitmap3[k]) return false; } if (!(bits % BITS_PER_LONG)) return true; tmp = (bitmap1[k] | bitmap2[k]) ^ bitmap3[k]; return (tmp & BITMAP_LAST_WORD_MASK(bits)) == 0; } void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits) { unsigned int k, lim = BITS_TO_LONGS(bits); for (k = 0; k < lim; ++k) dst[k] = ~src[k]; } EXPORT_SYMBOL(__bitmap_complement); /** * __bitmap_shift_right - logical right shift of the bits in a bitmap * @dst : destination bitmap * @src : source bitmap * @shift : shift by this many bits * @nbits : bitmap size, in bits * * Shifting right (dividing) means moving bits in the MS -> LS bit * direction. Zeros are fed into the vacated MS positions and the * LS bits shifted off the bottom are lost. */ void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, unsigned shift, unsigned nbits) { unsigned k, lim = BITS_TO_LONGS(nbits); unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; unsigned long mask = BITMAP_LAST_WORD_MASK(nbits); for (k = 0; off + k < lim; ++k) { unsigned long upper, lower; /* * If shift is not word aligned, take lower rem bits of * word above and make them the top rem bits of result. */ if (!rem || off + k + 1 >= lim) upper = 0; else { upper = src[off + k + 1]; if (off + k + 1 == lim - 1) upper &= mask; upper <<= (BITS_PER_LONG - rem); } lower = src[off + k]; if (off + k == lim - 1) lower &= mask; lower >>= rem; dst[k] = lower | upper; } if (off) memset(&dst[lim - off], 0, off*sizeof(unsigned long)); } EXPORT_SYMBOL(__bitmap_shift_right); /** * __bitmap_shift_left - logical left shift of the bits in a bitmap * @dst : destination bitmap * @src : source bitmap * @shift : shift by this many bits * @nbits : bitmap size, in bits * * Shifting left (multiplying) means moving bits in the LS -> MS * direction. Zeros are fed into the vacated LS bit positions * and those MS bits shifted off the top are lost. */ void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, unsigned int shift, unsigned int nbits) { int k; unsigned int lim = BITS_TO_LONGS(nbits); unsigned int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; for (k = lim - off - 1; k >= 0; --k) { unsigned long upper, lower; /* * If shift is not word aligned, take upper rem bits of * word below and make them the bottom rem bits of result. */ if (rem && k > 0) lower = src[k - 1] >> (BITS_PER_LONG - rem); else lower = 0; upper = src[k] << rem; dst[k + off] = lower | upper; } if (off) memset(dst, 0, off*sizeof(unsigned long)); } EXPORT_SYMBOL(__bitmap_shift_left); /** * bitmap_cut() - remove bit region from bitmap and right shift remaining bits * @dst: destination bitmap, might overlap with src * @src: source bitmap * @first: start bit of region to be removed * @cut: number of bits to remove * @nbits: bitmap size, in bits * * Set the n-th bit of @dst iff the n-th bit of @src is set and * n is less than @first, or the m-th bit of @src is set for any * m such that @first <= n < nbits, and m = n + @cut. * * In pictures, example for a big-endian 32-bit architecture: * * The @src bitmap is:: * * 31 63 * | | * 10000000 11000001 11110010 00010101 10000000 11000001 01110010 00010101 * | | | | * 16 14 0 32 * * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is:: * * 31 63 * | | * 10110000 00011000 00110010 00010101 00010000 00011000 00101110 01000010 * | | | * 14 (bit 17 0 32 * from @src) * * Note that @dst and @src might overlap partially or entirely. * * This is implemented in the obvious way, with a shift and carry * step for each moved bit. Optimisation is left as an exercise * for the compiler. */ void bitmap_cut(unsigned long *dst, const unsigned long *src, unsigned int first, unsigned int cut, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits); unsigned long keep = 0, carry; int i; if (first % BITS_PER_LONG) { keep = src[first / BITS_PER_LONG] & (~0UL >> (BITS_PER_LONG - first % BITS_PER_LONG)); } memmove(dst, src, len * sizeof(*dst)); while (cut--) { for (i = first / BITS_PER_LONG; i < len; i++) { if (i < len - 1) carry = dst[i + 1] & 1UL; else carry = 0; dst[i] = (dst[i] >> 1) | (carry << (BITS_PER_LONG - 1)); } } dst[first / BITS_PER_LONG] &= ~0UL << (first % BITS_PER_LONG); dst[first / BITS_PER_LONG] |= keep; } EXPORT_SYMBOL(bitmap_cut); bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & bitmap2[k]); if (bits % BITS_PER_LONG) result |= (dst[k] = bitmap1[k] & bitmap2[k] & BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_and); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] | bitmap2[k]; } EXPORT_SYMBOL(__bitmap_or); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] ^ bitmap2[k]; } EXPORT_SYMBOL(__bitmap_xor); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k; unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & ~bitmap2[k]); if (bits % BITS_PER_LONG) result |= (dst[k] = bitmap1[k] & ~bitmap2[k] & BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_andnot); void __bitmap_replace(unsigned long *dst, const unsigned long *old, const unsigned long *new, const unsigned long *mask, unsigned int nbits) { unsigned int k; unsigned int nr = BITS_TO_LONGS(nbits); for (k = 0; k < nr; k++) dst[k] = (old[k] & ~mask[k]) | (new[k] & mask[k]); } EXPORT_SYMBOL(__bitmap_replace); bool __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & bitmap2[k]) return true; if (bits % BITS_PER_LONG) if ((bitmap1[k] & bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return true; return false; } EXPORT_SYMBOL(__bitmap_intersects); bool __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & ~bitmap2[k]) return false; if (bits % BITS_PER_LONG) if ((bitmap1[k] & ~bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return false; return true; } EXPORT_SYMBOL(__bitmap_subset); #define BITMAP_WEIGHT(FETCH, bits) \ ({ \ unsigned int __bits = (bits), idx, w = 0; \ \ for (idx = 0; idx < __bits / BITS_PER_LONG; idx++) \ w += hweight_long(FETCH); \ \ if (__bits % BITS_PER_LONG) \ w += hweight_long((FETCH) & BITMAP_LAST_WORD_MASK(__bits)); \ \ w; \ }) unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int bits) { return BITMAP_WEIGHT(bitmap[idx], bits); } EXPORT_SYMBOL(__bitmap_weight); unsigned int __bitmap_weight_and(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits) { return BITMAP_WEIGHT(bitmap1[idx] & bitmap2[idx], bits); } EXPORT_SYMBOL(__bitmap_weight_and); void __bitmap_set(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); while (len - bits_to_set >= 0) { *p |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_LONG; mask_to_set = ~0UL; p++; } if (len) { mask_to_set &= BITMAP_LAST_WORD_MASK(size); *p |= mask_to_set; } } EXPORT_SYMBOL(__bitmap_set); void __bitmap_clear(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); const unsigned int size = start + len; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); while (len - bits_to_clear >= 0) { *p &= ~mask_to_clear; len -= bits_to_clear; bits_to_clear = BITS_PER_LONG; mask_to_clear = ~0UL; p++; } if (len) { mask_to_clear &= BITMAP_LAST_WORD_MASK(size); *p &= ~mask_to_clear; } } EXPORT_SYMBOL(__bitmap_clear); /** * bitmap_find_next_zero_area_off - find a contiguous aligned zero area * @map: The address to base the search on * @size: The bitmap size in bits * @start: The bitnumber to start searching at * @nr: The number of zeroed bits we're looking for * @align_mask: Alignment mask for zero area * @align_offset: Alignment offset for zero area. * * The @align_mask should be one less than a power of 2; the effect is that * the bit offset of all zero areas this function finds plus @align_offset * is multiple of that power of 2. */ unsigned long bitmap_find_next_zero_area_off(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long align_mask, unsigned long align_offset) { unsigned long index, end, i; again: index = find_next_zero_bit(map, size, start); /* Align allocation */ index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset; end = index + nr; if (end > size) return end; i = find_next_bit(map, end, index); if (i < end) { start = i + 1; goto again; } return index; } EXPORT_SYMBOL(bitmap_find_next_zero_area_off); /** * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap * @buf: pointer to a bitmap * @pos: a bit position in @buf (0 <= @pos < @nbits) * @nbits: number of valid bit positions in @buf * * Map the bit at position @pos in @buf (of length @nbits) to the * ordinal of which set bit it is. If it is not set or if @pos * is not a valid bit position, map to -1. * * If for example, just bits 4 through 7 are set in @buf, then @pos * values 4 through 7 will get mapped to 0 through 3, respectively, * and other @pos values will get mapped to -1. When @pos value 7 * gets mapped to (returns) @ord value 3 in this example, that means * that bit 7 is the 3rd (starting with 0th) set bit in @buf. * * The bit positions 0 through @bits are valid positions in @buf. */ static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits) { if (pos >= nbits || !test_bit(pos, buf)) return -1; return bitmap_weight(buf, pos); } /** * bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap * @dst: remapped result * @src: subset to be remapped * @old: defines domain of map * @new: defines range of map * @nbits: number of bits in each of these bitmaps * * Let @old and @new define a mapping of bit positions, such that * whatever position is held by the n-th set bit in @old is mapped * to the n-th set bit in @new. In the more general case, allowing * for the possibility that the weight 'w' of @new is less than the * weight of @old, map the position of the n-th set bit in @old to * the position of the m-th set bit in @new, where m == n % w. * * If either of the @old and @new bitmaps are empty, or if @src and * @dst point to the same location, then this routine copies @src * to @dst. * * The positions of unset bits in @old are mapped to themselves * (the identity map). * * Apply the above specified mapping to @src, placing the result in * @dst, clearing any bits previously set in @dst. * * For example, lets say that @old has bits 4 through 7 set, and * @new has bits 12 through 15 set. This defines the mapping of bit * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other * bit positions unchanged. So if say @src comes into this routine * with bits 1, 5 and 7 set, then @dst should leave with bits 1, * 13 and 15 set. */ void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, unsigned int nbits) { unsigned int oldbit, w; if (dst == src) /* following doesn't handle inplace remaps */ return; bitmap_zero(dst, nbits); w = bitmap_weight(new, nbits); for_each_set_bit(oldbit, src, nbits) { int n = bitmap_pos_to_ord(old, oldbit, nbits); if (n < 0 || w == 0) set_bit(oldbit, dst); /* identity map */ else set_bit(find_nth_bit(new, nbits, n % w), dst); } } EXPORT_SYMBOL(bitmap_remap); /** * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit * @oldbit: bit position to be mapped * @old: defines domain of map * @new: defines range of map * @bits: number of bits in each of these bitmaps * * Let @old and @new define a mapping of bit positions, such that * whatever position is held by the n-th set bit in @old is mapped * to the n-th set bit in @new. In the more general case, allowing * for the possibility that the weight 'w' of @new is less than the * weight of @old, map the position of the n-th set bit in @old to * the position of the m-th set bit in @new, where m == n % w. * * The positions of unset bits in @old are mapped to themselves * (the identity map). * * Apply the above specified mapping to bit position @oldbit, returning * the new bit position. * * For example, lets say that @old has bits 4 through 7 set, and * @new has bits 12 through 15 set. This defines the mapping of bit * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other * bit positions unchanged. So if say @oldbit is 5, then this routine * returns 13. */ int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits) { int w = bitmap_weight(new, bits); int n = bitmap_pos_to_ord(old, oldbit, bits); if (n < 0 || w == 0) return oldbit; else return find_nth_bit(new, bits, n % w); } EXPORT_SYMBOL(bitmap_bitremap); #ifdef CONFIG_NUMA /** * bitmap_onto - translate one bitmap relative to another * @dst: resulting translated bitmap * @orig: original untranslated bitmap * @relmap: bitmap relative to which translated * @bits: number of bits in each of these bitmaps * * Set the n-th bit of @dst iff there exists some m such that the * n-th bit of @relmap is set, the m-th bit of @orig is set, and * the n-th bit of @relmap is also the m-th _set_ bit of @relmap. * (If you understood the previous sentence the first time your * read it, you're overqualified for your current job.) * * In other words, @orig is mapped onto (surjectively) @dst, * using the map { <n, m> | the n-th bit of @relmap is the * m-th set bit of @relmap }. * * Any set bits in @orig above bit number W, where W is the * weight of (number of set bits in) @relmap are mapped nowhere. * In particular, if for all bits m set in @orig, m >= W, then * @dst will end up empty. In situations where the possibility * of such an empty result is not desired, one way to avoid it is * to use the bitmap_fold() operator, below, to first fold the * @orig bitmap over itself so that all its set bits x are in the * range 0 <= x < W. The bitmap_fold() operator does this by * setting the bit (m % W) in @dst, for each bit (m) set in @orig. * * Example [1] for bitmap_onto(): * Let's say @relmap has bits 30-39 set, and @orig has bits * 1, 3, 5, 7, 9 and 11 set. Then on return from this routine, * @dst will have bits 31, 33, 35, 37 and 39 set. * * When bit 0 is set in @orig, it means turn on the bit in * @dst corresponding to whatever is the first bit (if any) * that is turned on in @relmap. Since bit 0 was off in the * above example, we leave off that bit (bit 30) in @dst. * * When bit 1 is set in @orig (as in the above example), it * means turn on the bit in @dst corresponding to whatever * is the second bit that is turned on in @relmap. The second * bit in @relmap that was turned on in the above example was * bit 31, so we turned on bit 31 in @dst. * * Similarly, we turned on bits 33, 35, 37 and 39 in @dst, * because they were the 4th, 6th, 8th and 10th set bits * set in @relmap, and the 4th, 6th, 8th and 10th bits of * @orig (i.e. bits 3, 5, 7 and 9) were also set. * * When bit 11 is set in @orig, it means turn on the bit in * @dst corresponding to whatever is the twelfth bit that is * turned on in @relmap. In the above example, there were * only ten bits turned on in @relmap (30..39), so that bit * 11 was set in @orig had no affect on @dst. * * Example [2] for bitmap_fold() + bitmap_onto(): * Let's say @relmap has these ten bits set:: * * 40 41 42 43 45 48 53 61 74 95 * * (for the curious, that's 40 plus the first ten terms of the * Fibonacci sequence.) * * Further lets say we use the following code, invoking * bitmap_fold() then bitmap_onto, as suggested above to * avoid the possibility of an empty @dst result:: * * unsigned long *tmp; // a temporary bitmap's bits * * bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits); * bitmap_onto(dst, tmp, relmap, bits); * * Then this table shows what various values of @dst would be, for * various @orig's. I list the zero-based positions of each set bit. * The tmp column shows the intermediate result, as computed by * using bitmap_fold() to fold the @orig bitmap modulo ten * (the weight of @relmap): * * =============== ============== ================= * @orig tmp @dst * 0 0 40 * 1 1 41 * 9 9 95 * 10 0 40 [#f1]_ * 1 3 5 7 1 3 5 7 41 43 48 61 * 0 1 2 3 4 0 1 2 3 4 40 41 42 43 45 * 0 9 18 27 0 9 8 7 40 61 74 95 * 0 10 20 30 0 40 * 0 11 22 33 0 1 2 3 40 41 42 43 * 0 12 24 36 0 2 4 6 40 42 45 53 * 78 102 211 1 2 8 41 42 74 [#f1]_ * =============== ============== ================= * * .. [#f1] * * For these marked lines, if we hadn't first done bitmap_fold() * into tmp, then the @dst result would have been empty. * * If either of @orig or @relmap is empty (no set bits), then @dst * will be returned empty. * * If (as explained above) the only set bits in @orig are in positions * m where m >= W, (where W is the weight of @relmap) then @dst will * once again be returned empty. * * All bits in @dst not set by the above rule are cleared. */ void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits) { unsigned int n, m; /* same meaning as in above comment */ if (dst == orig) /* following doesn't handle inplace mappings */ return; bitmap_zero(dst, bits); /* * The following code is a more efficient, but less * obvious, equivalent to the loop: * for (m = 0; m < bitmap_weight(relmap, bits); m++) { * n = find_nth_bit(orig, bits, m); * if (test_bit(m, orig)) * set_bit(n, dst); * } */ m = 0; for_each_set_bit(n, relmap, bits) { /* m == bitmap_pos_to_ord(relmap, n, bits) */ if (test_bit(m, orig)) set_bit(n, dst); m++; } } /** * bitmap_fold - fold larger bitmap into smaller, modulo specified size * @dst: resulting smaller bitmap * @orig: original larger bitmap * @sz: specified size * @nbits: number of bits in each of these bitmaps * * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst. * Clear all other bits in @dst. See further the comment and * Example [2] for bitmap_onto() for why and how to use this. */ void bitmap_fold(unsigned long *dst, const unsigned long *orig, unsigned int sz, unsigned int nbits) { unsigned int oldbit; if (dst == orig) /* following doesn't handle inplace mappings */ return; bitmap_zero(dst, nbits); for_each_set_bit(oldbit, orig, nbits) set_bit(oldbit % sz, dst); } #endif /* CONFIG_NUMA */ unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags) { return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long), flags); } EXPORT_SYMBOL(bitmap_alloc); unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags) { return bitmap_alloc(nbits, flags | __GFP_ZERO); } EXPORT_SYMBOL(bitmap_zalloc); unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node) { return kmalloc_array_node(BITS_TO_LONGS(nbits), sizeof(unsigned long), flags, node); } EXPORT_SYMBOL(bitmap_alloc_node); unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node) { return bitmap_alloc_node(nbits, flags | __GFP_ZERO, node); } EXPORT_SYMBOL(bitmap_zalloc_node); void bitmap_free(const unsigned long *bitmap) { kfree(bitmap); } EXPORT_SYMBOL(bitmap_free); static void devm_bitmap_free(void *data) { unsigned long *bitmap = data; bitmap_free(bitmap); } unsigned long *devm_bitmap_alloc(struct device *dev, unsigned int nbits, gfp_t flags) { unsigned long *bitmap; int ret; bitmap = bitmap_alloc(nbits, flags); if (!bitmap) return NULL; ret = devm_add_action_or_reset(dev, devm_bitmap_free, bitmap); if (ret) return NULL; return bitmap; } EXPORT_SYMBOL_GPL(devm_bitmap_alloc); unsigned long *devm_bitmap_zalloc(struct device *dev, unsigned int nbits, gfp_t flags) { return devm_bitmap_alloc(dev, nbits, flags | __GFP_ZERO); } EXPORT_SYMBOL_GPL(devm_bitmap_zalloc); #if BITS_PER_LONG == 64 /** * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap * @bitmap: array of unsigned longs, the destination bitmap * @buf: array of u32 (in host byte order), the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits) { unsigned int i, halfwords; halfwords = DIV_ROUND_UP(nbits, 32); for (i = 0; i < halfwords; i++) { bitmap[i/2] = (unsigned long) buf[i]; if (++i < halfwords) bitmap[i/2] |= ((unsigned long) buf[i]) << 32; } /* Clear tail bits in last word beyond nbits. */ if (nbits % BITS_PER_LONG) bitmap[(halfwords - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits); } EXPORT_SYMBOL(bitmap_from_arr32); /** * bitmap_to_arr32 - copy the contents of bitmap to a u32 array of bits * @buf: array of u32 (in host byte order), the dest bitmap * @bitmap: array of unsigned longs, the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits) { unsigned int i, halfwords; halfwords = DIV_ROUND_UP(nbits, 32); for (i = 0; i < halfwords; i++) { buf[i] = (u32) (bitmap[i/2] & UINT_MAX); if (++i < halfwords) buf[i] = (u32) (bitmap[i/2] >> 32); } /* Clear tail bits in last element of array beyond nbits. */ if (nbits % BITS_PER_LONG) buf[halfwords - 1] &= (u32) (UINT_MAX >> ((-nbits) & 31)); } EXPORT_SYMBOL(bitmap_to_arr32); #endif #if BITS_PER_LONG == 32 /** * bitmap_from_arr64 - copy the contents of u64 array of bits to bitmap * @bitmap: array of unsigned longs, the destination bitmap * @buf: array of u64 (in host byte order), the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits) { int n; for (n = nbits; n > 0; n -= 64) { u64 val = *buf++; *bitmap++ = val; if (n > 32) *bitmap++ = val >> 32; } /* * Clear tail bits in the last word beyond nbits. * * Negative index is OK because here we point to the word next * to the last word of the bitmap, except for nbits == 0, which * is tested implicitly. */ if (nbits % BITS_PER_LONG) bitmap[-1] &= BITMAP_LAST_WORD_MASK(nbits); } EXPORT_SYMBOL(bitmap_from_arr64); /** * bitmap_to_arr64 - copy the contents of bitmap to a u64 array of bits * @buf: array of u64 (in host byte order), the dest bitmap * @bitmap: array of unsigned longs, the source bitmap * @nbits: number of bits in @bitmap */ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits) { const unsigned long *end = bitmap + BITS_TO_LONGS(nbits); while (bitmap < end) { *buf = *bitmap++; if (bitmap < end) *buf |= (u64)(*bitmap++) << 32; buf++; } /* Clear tail bits in the last element of array beyond nbits. */ if (nbits % 64) buf[-1] &= GENMASK_ULL((nbits - 1) % 64, 0); } EXPORT_SYMBOL(bitmap_to_arr64); #endif
2 77 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2006, Johannes Berg <johannes@sipsolutions.net> */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/leds.h> #include "ieee80211_i.h" #define MAC80211_BLINK_DELAY 50 /* ms */ static inline void ieee80211_led_rx(struct ieee80211_local *local) { #ifdef CONFIG_MAC80211_LEDS if (!atomic_read(&local->rx_led_active)) return; led_trigger_blink_oneshot(&local->rx_led, MAC80211_BLINK_DELAY, MAC80211_BLINK_DELAY, 0); #endif } static inline void ieee80211_led_tx(struct ieee80211_local *local) { #ifdef CONFIG_MAC80211_LEDS if (!atomic_read(&local->tx_led_active)) return; led_trigger_blink_oneshot(&local->tx_led, MAC80211_BLINK_DELAY, MAC80211_BLINK_DELAY, 0); #endif } #ifdef CONFIG_MAC80211_LEDS void ieee80211_led_assoc(struct ieee80211_local *local, bool associated); void ieee80211_led_radio(struct ieee80211_local *local, bool enabled); void ieee80211_alloc_led_names(struct ieee80211_local *local); void ieee80211_free_led_names(struct ieee80211_local *local); void ieee80211_led_init(struct ieee80211_local *local); void ieee80211_led_exit(struct ieee80211_local *local); void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off); #else static inline void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { } static inline void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { } static inline void ieee80211_alloc_led_names(struct ieee80211_local *local) { } static inline void ieee80211_free_led_names(struct ieee80211_local *local) { } static inline void ieee80211_led_init(struct ieee80211_local *local) { } static inline void ieee80211_led_exit(struct ieee80211_local *local) { } static inline void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off) { } #endif static inline void ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, int bytes) { #ifdef CONFIG_MAC80211_LEDS if (atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->tx_bytes += bytes; #endif } static inline void ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, int bytes) { #ifdef CONFIG_MAC80211_LEDS if (atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->rx_bytes += bytes; #endif }
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * "Ping" sockets * * Based on ipv4/ping.c code. * * Authors: Lorenzo Colitti (IPv6 support) * Vasiliy Kulikov / Openwall (IPv4 implementation, for Linux 2.6), * Pavel Kankovsky (IPv4 implementation, for Linux 2.4.32) */ #include <net/addrconf.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/protocol.h> #include <net/udp.h> #include <net/transp_v6.h> #include <linux/proc_fs.h> #include <linux/bpf-cgroup.h> #include <net/ping.h> /* Compatibility glue so we can support IPv6 when it's compiled as a module */ static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { return -EAFNOSUPPORT; } static void dummy_ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { } static int dummy_icmpv6_err_convert(u8 type, u8 code, int *err) { return -EAFNOSUPPORT; } static void dummy_ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) {} static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict) { return 0; } static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { /* This check is replicated from __ip6_datagram_connect() and * intended to prevent BPF program called below from accessing * bytes that are out of the bound specified by user in addr_len. */ if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len); } static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct icmp6hdr user_icmph; int addr_type; struct in6_addr *daddr; int oif = 0; struct flowi6 fl6; int err; struct dst_entry *dst; struct rt6_info *rt; struct pingfakehdr pfh; struct ipcm6_cookie ipc6; err = ping_common_sendmsg(AF_INET6, msg, len, &user_icmph, sizeof(user_icmph)); if (err) return err; memset(&fl6, 0, sizeof(fl6)); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_in6 *, u, msg->msg_name); if (msg->msg_namelen < sizeof(*u)) return -EINVAL; if (u->sin6_family != AF_INET6) { return -EAFNOSUPPORT; } daddr = &(u->sin6_addr); if (inet6_test_bit(SNDFLOW, sk)) fl6.flowlabel = u->sin6_flowinfo & IPV6_FLOWINFO_MASK; if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr))) oif = u->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = &sk->sk_v6_daddr; fl6.flowlabel = np->flow_label; } if (!oif) oif = sk->sk_bound_dev_if; if (!oif) oif = np->sticky_pktinfo.ipi6_ifindex; if (!oif && ipv6_addr_is_multicast(daddr)) oif = np->mcast_oif; else if (!oif) oif = np->ucast_oif; addr_type = ipv6_addr_type(daddr); if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) || (addr_type & IPV6_ADDR_MAPPED) || (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if && l3mdev_master_ifindex_by_index(sock_net(sk), oif) != sk->sk_bound_dev_if)) return -EINVAL; ipcm6_init_sk(&ipc6, sk); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); fl6.flowi6_oif = oif; if (msg->msg_controllen) { struct ipv6_txoptions opt = {}; opt.tot_len = sizeof(opt); ipc6.opt = &opt; err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) return err; /* Changes to txoptions and flow info are not implemented, yet. * Drop the options. */ ipc6.opt = NULL; } fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.saddr = np->saddr; fl6.daddr = *daddr; fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); if (IS_ERR(dst)) return PTR_ERR(dst); rt = (struct rt6_info *) dst; if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = np->mcast_oif; else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; pfh.icmph.type = user_icmph.icmp6_type; pfh.icmph.code = user_icmph.icmp6_code; pfh.icmph.checksum = 0; pfh.icmph.un.echo.id = inet->inet_sport; pfh.icmph.un.echo.sequence = user_icmph.icmp6_sequence; pfh.msg = msg; pfh.wcheck = 0; pfh.family = AF_INET6; if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, sizeof(struct icmp6hdr), &ipc6, &fl6, rt, MSG_DONTWAIT); if (err) { ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { icmpv6_push_pending_frames(sk, &fl6, (struct icmp6hdr *)&pfh.icmph, len); } release_sock(sk); dst_release(dst); if (err) return err; return len; } struct proto pingv6_prot = { .name = "PINGv6", .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, .pre_connect = ping_v6_pre_connect, .connect = ip6_datagram_connect_v6_only, .disconnect = __udp_disconnect, .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .sendmsg = ping_v6_sendmsg, .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct raw6_sock), .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), }; EXPORT_SYMBOL_GPL(pingv6_prot); static struct inet_protosw pingv6_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_ICMPV6, .prot = &pingv6_prot, .ops = &inet6_sockraw_ops, .flags = INET_PROTOSW_REUSE, }; #ifdef CONFIG_PROC_FS static void *ping_v6_seq_start(struct seq_file *seq, loff_t *pos) { return ping_seq_start(seq, pos, AF_INET6); } static int ping_v6_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); } else { int bucket = ((struct ping_iter_state *) seq->private)->bucket; struct inet_sock *inet = inet_sk((struct sock *)v); __u16 srcp = ntohs(inet->inet_sport); __u16 destp = ntohs(inet->inet_dport); ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket); } return 0; } static const struct seq_operations ping_v6_seq_ops = { .start = ping_v6_seq_start, .show = ping_v6_seq_show, .next = ping_seq_next, .stop = ping_seq_stop, }; static int __net_init ping_v6_proc_init_net(struct net *net) { if (!proc_create_net("icmp6", 0444, net->proc_net, &ping_v6_seq_ops, sizeof(struct ping_iter_state))) return -ENOMEM; return 0; } static void __net_exit ping_v6_proc_exit_net(struct net *net) { remove_proc_entry("icmp6", net->proc_net); } static struct pernet_operations ping_v6_net_ops = { .init = ping_v6_proc_init_net, .exit = ping_v6_proc_exit_net, }; #endif int __init pingv6_init(void) { #ifdef CONFIG_PROC_FS int ret = register_pernet_subsys(&ping_v6_net_ops); if (ret) return ret; #endif pingv6_ops.ipv6_recv_error = ipv6_recv_error; pingv6_ops.ip6_datagram_recv_common_ctl = ip6_datagram_recv_common_ctl; pingv6_ops.ip6_datagram_recv_specific_ctl = ip6_datagram_recv_specific_ctl; pingv6_ops.icmpv6_err_convert = icmpv6_err_convert; pingv6_ops.ipv6_icmp_error = ipv6_icmp_error; pingv6_ops.ipv6_chk_addr = ipv6_chk_addr; return inet6_register_protosw(&pingv6_protosw); } /* This never gets called because it's not possible to unload the ipv6 module, * but just in case. */ void pingv6_exit(void) { pingv6_ops.ipv6_recv_error = dummy_ipv6_recv_error; pingv6_ops.ip6_datagram_recv_common_ctl = dummy_ip6_datagram_recv_ctl; pingv6_ops.ip6_datagram_recv_specific_ctl = dummy_ip6_datagram_recv_ctl; pingv6_ops.icmpv6_err_convert = dummy_icmpv6_err_convert; pingv6_ops.ipv6_icmp_error = dummy_ipv6_icmp_error; pingv6_ops.ipv6_chk_addr = dummy_ipv6_chk_addr; #ifdef CONFIG_PROC_FS unregister_pernet_subsys(&ping_v6_net_ops); #endif inet6_unregister_protosw(&pingv6_protosw); }
22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IRQ_H #define _LINUX_IRQ_H /* * Please do not include this file in generic code. There is currently * no requirement for any architecture to implement anything held * within this file. * * Thanks. --rmk */ #include <linux/cache.h> #include <linux/spinlock.h> #include <linux/cpumask.h> #include <linux/irqhandler.h> #include <linux/irqreturn.h> #include <linux/irqnr.h> #include <linux/topology.h> #include <linux/io.h> #include <linux/slab.h> #include <asm/irq.h> #include <asm/ptrace.h> #include <asm/irq_regs.h> struct seq_file; struct module; struct msi_msg; struct irq_affinity_desc; enum irqchip_irq_state; /* * IRQ line status. * * Bits 0-7 are the same as the IRQF_* bits in linux/interrupt.h * * IRQ_TYPE_NONE - default, unspecified type * IRQ_TYPE_EDGE_RISING - rising edge triggered * IRQ_TYPE_EDGE_FALLING - falling edge triggered * IRQ_TYPE_EDGE_BOTH - rising and falling edge triggered * IRQ_TYPE_LEVEL_HIGH - high level triggered * IRQ_TYPE_LEVEL_LOW - low level triggered * IRQ_TYPE_LEVEL_MASK - Mask to filter out the level bits * IRQ_TYPE_SENSE_MASK - Mask for all the above bits * IRQ_TYPE_DEFAULT - For use by some PICs to ask irq_set_type * to setup the HW to a sane default (used * by irqdomain map() callbacks to synchronize * the HW state and SW flags for a newly * allocated descriptor). * * IRQ_TYPE_PROBE - Special flag for probing in progress * * Bits which can be modified via irq_set/clear/modify_status_flags() * IRQ_LEVEL - Interrupt is level type. Will be also * updated in the code when the above trigger * bits are modified via irq_set_irq_type() * IRQ_PER_CPU - Mark an interrupt PER_CPU. Will protect * it from affinity setting * IRQ_NOPROBE - Interrupt cannot be probed by autoprobing * IRQ_NOREQUEST - Interrupt cannot be requested via * request_irq() * IRQ_NOTHREAD - Interrupt cannot be threaded * IRQ_NOAUTOEN - Interrupt is not automatically enabled in * request/setup_irq() * IRQ_NO_BALANCING - Interrupt cannot be balanced (affinity set) * IRQ_MOVE_PCNTXT - Interrupt can be migrated from process context * IRQ_NESTED_THREAD - Interrupt nests into another thread * IRQ_PER_CPU_DEVID - Dev_id is a per-cpu variable * IRQ_IS_POLLED - Always polled by another interrupt. Exclude * it from the spurious interrupt detection * mechanism and from core side polling. * IRQ_DISABLE_UNLAZY - Disable lazy irq disable * IRQ_HIDDEN - Don't show up in /proc/interrupts * IRQ_NO_DEBUG - Exclude from note_interrupt() debugging */ enum { IRQ_TYPE_NONE = 0x00000000, IRQ_TYPE_EDGE_RISING = 0x00000001, IRQ_TYPE_EDGE_FALLING = 0x00000002, IRQ_TYPE_EDGE_BOTH = (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING), IRQ_TYPE_LEVEL_HIGH = 0x00000004, IRQ_TYPE_LEVEL_LOW = 0x00000008, IRQ_TYPE_LEVEL_MASK = (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH), IRQ_TYPE_SENSE_MASK = 0x0000000f, IRQ_TYPE_DEFAULT = IRQ_TYPE_SENSE_MASK, IRQ_TYPE_PROBE = 0x00000010, IRQ_LEVEL = (1 << 8), IRQ_PER_CPU = (1 << 9), IRQ_NOPROBE = (1 << 10), IRQ_NOREQUEST = (1 << 11), IRQ_NOAUTOEN = (1 << 12), IRQ_NO_BALANCING = (1 << 13), IRQ_MOVE_PCNTXT = (1 << 14), IRQ_NESTED_THREAD = (1 << 15), IRQ_NOTHREAD = (1 << 16), IRQ_PER_CPU_DEVID = (1 << 17), IRQ_IS_POLLED = (1 << 18), IRQ_DISABLE_UNLAZY = (1 << 19), IRQ_HIDDEN = (1 << 20), IRQ_NO_DEBUG = (1 << 21), }; #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \ IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_HIDDEN) #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) /* * Return value for chip->irq_set_affinity() * * IRQ_SET_MASK_OK - OK, core updates irq_common_data.affinity * IRQ_SET_MASK_NOCPY - OK, chip did update irq_common_data.affinity * IRQ_SET_MASK_OK_DONE - Same as IRQ_SET_MASK_OK for core. Special code to * support stacked irqchips, which indicates skipping * all descendant irqchips. */ enum { IRQ_SET_MASK_OK = 0, IRQ_SET_MASK_OK_NOCOPY, IRQ_SET_MASK_OK_DONE, }; struct msi_desc; struct irq_domain; /** * struct irq_common_data - per irq data shared by all irqchips * @state_use_accessors: status information for irq chip functions. * Use accessor functions to deal with it * @node: node index useful for balancing * @handler_data: per-IRQ data for the irq_chip methods * @affinity: IRQ affinity on SMP. If this is an IPI * related irq, then this is the mask of the * CPUs to which an IPI can be sent. * @effective_affinity: The effective IRQ affinity on SMP as some irq * chips do not allow multi CPU destinations. * A subset of @affinity. * @msi_desc: MSI descriptor * @ipi_offset: Offset of first IPI target cpu in @affinity. Optional. */ struct irq_common_data { unsigned int __private state_use_accessors; #ifdef CONFIG_NUMA unsigned int node; #endif void *handler_data; struct msi_desc *msi_desc; #ifdef CONFIG_SMP cpumask_var_t affinity; #endif #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK cpumask_var_t effective_affinity; #endif #ifdef CONFIG_GENERIC_IRQ_IPI unsigned int ipi_offset; #endif }; /** * struct irq_data - per irq chip data passed down to chip functions * @mask: precomputed bitmask for accessing the chip registers * @irq: interrupt number * @hwirq: hardware interrupt number, local to the interrupt domain * @common: point to data shared by all irqchips * @chip: low level interrupt hardware access * @domain: Interrupt translation domain; responsible for mapping * between hwirq number and linux irq number. * @parent_data: pointer to parent struct irq_data to support hierarchy * irq_domain * @chip_data: platform-specific per-chip private data for the chip * methods, to allow shared chip implementations */ struct irq_data { u32 mask; unsigned int irq; unsigned long hwirq; struct irq_common_data *common; struct irq_chip *chip; struct irq_domain *domain; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_data *parent_data; #endif void *chip_data; }; /* * Bit masks for irq_common_data.state_use_accessors * * IRQD_TRIGGER_MASK - Mask for the trigger type bits * IRQD_SETAFFINITY_PENDING - Affinity setting is pending * IRQD_ACTIVATED - Interrupt has already been activated * IRQD_NO_BALANCING - Balancing disabled for this IRQ * IRQD_PER_CPU - Interrupt is per cpu * IRQD_AFFINITY_SET - Interrupt affinity was set * IRQD_LEVEL - Interrupt is level triggered * IRQD_WAKEUP_STATE - Interrupt is configured for wakeup * from suspend * IRQD_MOVE_PCNTXT - Interrupt can be moved in process * context * IRQD_IRQ_DISABLED - Disabled state of the interrupt * IRQD_IRQ_MASKED - Masked state of the interrupt * IRQD_IRQ_INPROGRESS - In progress state of the interrupt * IRQD_WAKEUP_ARMED - Wakeup mode armed * IRQD_FORWARDED_TO_VCPU - The interrupt is forwarded to a VCPU * IRQD_AFFINITY_MANAGED - Affinity is auto-managed by the kernel * IRQD_IRQ_STARTED - Startup state of the interrupt * IRQD_MANAGED_SHUTDOWN - Interrupt was shutdown due to empty affinity * mask. Applies only to affinity managed irqs. * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set * IRQD_CAN_RESERVE - Can use reservation mode * IRQD_HANDLE_ENFORCE_IRQCTX - Enforce that handle_irq_*() is only invoked * from actual interrupt context. * IRQD_AFFINITY_ON_ACTIVATE - Affinity is set on activation. Don't call * irq_chip::irq_set_affinity() when deactivated. * IRQD_IRQ_ENABLED_ON_SUSPEND - Interrupt is enabled on suspend by irq pm if * irqchip have flag IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND set. * IRQD_RESEND_WHEN_IN_PROGRESS - Interrupt may fire when already in progress in which * case it must be resent at the next available opportunity. */ enum { IRQD_TRIGGER_MASK = 0xf, IRQD_SETAFFINITY_PENDING = BIT(8), IRQD_ACTIVATED = BIT(9), IRQD_NO_BALANCING = BIT(10), IRQD_PER_CPU = BIT(11), IRQD_AFFINITY_SET = BIT(12), IRQD_LEVEL = BIT(13), IRQD_WAKEUP_STATE = BIT(14), IRQD_MOVE_PCNTXT = BIT(15), IRQD_IRQ_DISABLED = BIT(16), IRQD_IRQ_MASKED = BIT(17), IRQD_IRQ_INPROGRESS = BIT(18), IRQD_WAKEUP_ARMED = BIT(19), IRQD_FORWARDED_TO_VCPU = BIT(20), IRQD_AFFINITY_MANAGED = BIT(21), IRQD_IRQ_STARTED = BIT(22), IRQD_MANAGED_SHUTDOWN = BIT(23), IRQD_SINGLE_TARGET = BIT(24), IRQD_DEFAULT_TRIGGER_SET = BIT(25), IRQD_CAN_RESERVE = BIT(26), IRQD_HANDLE_ENFORCE_IRQCTX = BIT(27), IRQD_AFFINITY_ON_ACTIVATE = BIT(28), IRQD_IRQ_ENABLED_ON_SUSPEND = BIT(29), IRQD_RESEND_WHEN_IN_PROGRESS = BIT(30), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline bool irqd_is_setaffinity_pending(struct irq_data *d) { return __irqd_to_state(d) & IRQD_SETAFFINITY_PENDING; } static inline bool irqd_is_per_cpu(struct irq_data *d) { return __irqd_to_state(d) & IRQD_PER_CPU; } static inline bool irqd_can_balance(struct irq_data *d) { return !(__irqd_to_state(d) & (IRQD_PER_CPU | IRQD_NO_BALANCING)); } static inline bool irqd_affinity_was_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_SET; } static inline void irqd_mark_affinity_was_set(struct irq_data *d) { __irqd_to_state(d) |= IRQD_AFFINITY_SET; } static inline bool irqd_trigger_type_was_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_DEFAULT_TRIGGER_SET; } static inline u32 irqd_get_trigger_type(struct irq_data *d) { return __irqd_to_state(d) & IRQD_TRIGGER_MASK; } /* * Must only be called inside irq_chip.irq_set_type() functions or * from the DT/ACPI setup code. */ static inline void irqd_set_trigger_type(struct irq_data *d, u32 type) { __irqd_to_state(d) &= ~IRQD_TRIGGER_MASK; __irqd_to_state(d) |= type & IRQD_TRIGGER_MASK; __irqd_to_state(d) |= IRQD_DEFAULT_TRIGGER_SET; } static inline bool irqd_is_level_type(struct irq_data *d) { return __irqd_to_state(d) & IRQD_LEVEL; } /* * Must only be called of irqchip.irq_set_affinity() or low level * hierarchy domain allocation functions. */ static inline void irqd_set_single_target(struct irq_data *d) { __irqd_to_state(d) |= IRQD_SINGLE_TARGET; } static inline bool irqd_is_single_target(struct irq_data *d) { return __irqd_to_state(d) & IRQD_SINGLE_TARGET; } static inline void irqd_set_handle_enforce_irqctx(struct irq_data *d) { __irqd_to_state(d) |= IRQD_HANDLE_ENFORCE_IRQCTX; } static inline bool irqd_is_handle_enforce_irqctx(struct irq_data *d) { return __irqd_to_state(d) & IRQD_HANDLE_ENFORCE_IRQCTX; } static inline bool irqd_is_enabled_on_suspend(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_ENABLED_ON_SUSPEND; } static inline bool irqd_is_wakeup_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_WAKEUP_STATE; } static inline bool irqd_can_move_in_process_context(struct irq_data *d) { return __irqd_to_state(d) & IRQD_MOVE_PCNTXT; } static inline bool irqd_irq_disabled(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_DISABLED; } static inline bool irqd_irq_masked(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_MASKED; } static inline bool irqd_irq_inprogress(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_INPROGRESS; } static inline bool irqd_is_wakeup_armed(struct irq_data *d) { return __irqd_to_state(d) & IRQD_WAKEUP_ARMED; } static inline bool irqd_is_forwarded_to_vcpu(struct irq_data *d) { return __irqd_to_state(d) & IRQD_FORWARDED_TO_VCPU; } static inline void irqd_set_forwarded_to_vcpu(struct irq_data *d) { __irqd_to_state(d) |= IRQD_FORWARDED_TO_VCPU; } static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU; } static inline bool irqd_affinity_is_managed(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_MANAGED; } static inline bool irqd_is_activated(struct irq_data *d) { return __irqd_to_state(d) & IRQD_ACTIVATED; } static inline void irqd_set_activated(struct irq_data *d) { __irqd_to_state(d) |= IRQD_ACTIVATED; } static inline void irqd_clr_activated(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_ACTIVATED; } static inline bool irqd_is_started(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_STARTED; } static inline bool irqd_is_managed_and_shutdown(struct irq_data *d) { return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN; } static inline void irqd_set_can_reserve(struct irq_data *d) { __irqd_to_state(d) |= IRQD_CAN_RESERVE; } static inline void irqd_clr_can_reserve(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_CAN_RESERVE; } static inline bool irqd_can_reserve(struct irq_data *d) { return __irqd_to_state(d) & IRQD_CAN_RESERVE; } static inline void irqd_set_affinity_on_activate(struct irq_data *d) { __irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE; } static inline bool irqd_affinity_on_activate(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_ON_ACTIVATE; } static inline void irqd_set_resend_when_in_progress(struct irq_data *d) { __irqd_to_state(d) |= IRQD_RESEND_WHEN_IN_PROGRESS; } static inline bool irqd_needs_resend_when_in_progress(struct irq_data *d) { return __irqd_to_state(d) & IRQD_RESEND_WHEN_IN_PROGRESS; } #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) { return d->hwirq; } /** * struct irq_chip - hardware interrupt chip descriptor * * @name: name for /proc/interrupts * @irq_startup: start up the interrupt (defaults to ->enable if NULL) * @irq_shutdown: shut down the interrupt (defaults to ->disable if NULL) * @irq_enable: enable the interrupt (defaults to chip->unmask if NULL) * @irq_disable: disable the interrupt * @irq_ack: start of a new interrupt * @irq_mask: mask an interrupt source * @irq_mask_ack: ack and mask an interrupt source * @irq_unmask: unmask an interrupt source * @irq_eoi: end of interrupt * @irq_set_affinity: Set the CPU affinity on SMP machines. If the force * argument is true, it tells the driver to * unconditionally apply the affinity setting. Sanity * checks against the supplied affinity mask are not * required. This is used for CPU hotplug where the * target CPU is not yet set in the cpu_online_mask. * @irq_retrigger: resend an IRQ to the CPU * @irq_set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ * @irq_set_wake: enable/disable power-management wake-on of an IRQ * @irq_bus_lock: function to lock access to slow bus (i2c) chips * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips * @irq_cpu_online: configure an interrupt source for a secondary CPU * @irq_cpu_offline: un-configure an interrupt source for a secondary CPU * @irq_suspend: function called from core code on suspend once per * chip, when one or more interrupts are installed * @irq_resume: function called from core code on resume once per chip, * when one ore more interrupts are installed * @irq_pm_shutdown: function called from core code on shutdown once per chip * @irq_calc_mask: Optional function to set irq_data.mask for special cases * @irq_print_chip: optional to print special chip info in show_interrupts * @irq_request_resources: optional to request resources before calling * any other callback related to this irq * @irq_release_resources: optional to release resources acquired with * irq_request_resources * @irq_compose_msi_msg: optional to compose message content for MSI * @irq_write_msi_msg: optional to write message content for MSI * @irq_get_irqchip_state: return the internal state of an interrupt * @irq_set_irqchip_state: set the internal state of a interrupt * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine * @ipi_send_single: send a single IPI to destination cpus * @ipi_send_mask: send an IPI to destination cpus in cpumask * @irq_nmi_setup: function called from core code before enabling an NMI * @irq_nmi_teardown: function called from core code after disabling an NMI * @flags: chip specific flags */ struct irq_chip { const char *name; unsigned int (*irq_startup)(struct irq_data *data); void (*irq_shutdown)(struct irq_data *data); void (*irq_enable)(struct irq_data *data); void (*irq_disable)(struct irq_data *data); void (*irq_ack)(struct irq_data *data); void (*irq_mask)(struct irq_data *data); void (*irq_mask_ack)(struct irq_data *data); void (*irq_unmask)(struct irq_data *data); void (*irq_eoi)(struct irq_data *data); int (*irq_set_affinity)(struct irq_data *data, const struct cpumask *dest, bool force); int (*irq_retrigger)(struct irq_data *data); int (*irq_set_type)(struct irq_data *data, unsigned int flow_type); int (*irq_set_wake)(struct irq_data *data, unsigned int on); void (*irq_bus_lock)(struct irq_data *data); void (*irq_bus_sync_unlock)(struct irq_data *data); #ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE void (*irq_cpu_online)(struct irq_data *data); void (*irq_cpu_offline)(struct irq_data *data); #endif void (*irq_suspend)(struct irq_data *data); void (*irq_resume)(struct irq_data *data); void (*irq_pm_shutdown)(struct irq_data *data); void (*irq_calc_mask)(struct irq_data *data); void (*irq_print_chip)(struct irq_data *data, struct seq_file *p); int (*irq_request_resources)(struct irq_data *data); void (*irq_release_resources)(struct irq_data *data); void (*irq_compose_msi_msg)(struct irq_data *data, struct msi_msg *msg); void (*irq_write_msi_msg)(struct irq_data *data, struct msi_msg *msg); int (*irq_get_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool *state); int (*irq_set_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool state); int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info); void (*ipi_send_single)(struct irq_data *data, unsigned int cpu); void (*ipi_send_mask)(struct irq_data *data, const struct cpumask *dest); int (*irq_nmi_setup)(struct irq_data *data); void (*irq_nmi_teardown)(struct irq_data *data); unsigned long flags; }; /* * irq_chip specific flags * * IRQCHIP_SET_TYPE_MASKED: Mask before calling chip.irq_set_type() * IRQCHIP_EOI_IF_HANDLED: Only issue irq_eoi() when irq was handled * IRQCHIP_MASK_ON_SUSPEND: Mask non wake irqs in the suspend path * IRQCHIP_ONOFFLINE_ENABLED: Only call irq_on/off_line callbacks * when irq enabled * IRQCHIP_SKIP_SET_WAKE: Skip chip.irq_set_wake(), for this irq chip * IRQCHIP_ONESHOT_SAFE: One shot does not require mask/unmask * IRQCHIP_EOI_THREADED: Chip requires eoi() on unmask in threaded mode * IRQCHIP_SUPPORTS_LEVEL_MSI: Chip can provide two doorbells for Level MSIs * IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips * IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND: Invokes __enable_irq()/__disable_irq() for wake irqs * in the suspend path if they are in disabled state * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup * IRQCHIP_IMMUTABLE: Don't ever change anything in this chip */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), IRQCHIP_EOI_IF_HANDLED = (1 << 1), IRQCHIP_MASK_ON_SUSPEND = (1 << 2), IRQCHIP_ONOFFLINE_ENABLED = (1 << 3), IRQCHIP_SKIP_SET_WAKE = (1 << 4), IRQCHIP_ONESHOT_SAFE = (1 << 5), IRQCHIP_EOI_THREADED = (1 << 6), IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7), IRQCHIP_SUPPORTS_NMI = (1 << 8), IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9), IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), IRQCHIP_IMMUTABLE = (1 << 11), }; #include <linux/irqdesc.h> /* * Pick up the arch-dependent methods: */ #include <asm/hw_irq.h> #ifndef NR_IRQS_LEGACY # define NR_IRQS_LEGACY 0 #endif #ifndef ARCH_IRQ_INIT_FLAGS # define ARCH_IRQ_INIT_FLAGS 0 #endif #define IRQ_DEFAULT_INIT_FLAGS ARCH_IRQ_INIT_FLAGS struct irqaction; extern int setup_percpu_irq(unsigned int irq, struct irqaction *new); extern void remove_percpu_irq(unsigned int irq, struct irqaction *act); #ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE extern void irq_cpu_online(void); extern void irq_cpu_offline(void); #endif extern int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *cpumask, bool force); extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_IRQ_MIGRATION) extern void irq_migrate_all_off_this_cpu(void); extern int irq_affinity_online_cpu(unsigned int cpu); #else # define irq_affinity_online_cpu NULL #endif #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) void __irq_move_irq(struct irq_data *data); static inline void irq_move_irq(struct irq_data *data) { if (unlikely(irqd_is_setaffinity_pending(data))) __irq_move_irq(data); } void irq_move_masked_irq(struct irq_data *data); void irq_force_complete_move(struct irq_desc *desc); #else static inline void irq_move_irq(struct irq_data *data) { } static inline void irq_move_masked_irq(struct irq_data *data) { } static inline void irq_force_complete_move(struct irq_desc *desc) { } #endif extern int no_irq_affinity; #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq); #else static inline int irq_set_parent(int irq, int parent_irq) { return 0; } #endif /* * Built-in IRQ handlers for various IRQ types, * callable via desc->handle_irq() */ extern void handle_level_irq(struct irq_desc *desc); extern void handle_fasteoi_irq(struct irq_desc *desc); extern void handle_edge_irq(struct irq_desc *desc); extern void handle_edge_eoi_irq(struct irq_desc *desc); extern void handle_simple_irq(struct irq_desc *desc); extern void handle_untracked_irq(struct irq_desc *desc); extern void handle_percpu_irq(struct irq_desc *desc); extern void handle_percpu_devid_irq(struct irq_desc *desc); extern void handle_bad_irq(struct irq_desc *desc); extern void handle_nested_irq(unsigned int irq); extern void handle_fasteoi_nmi(struct irq_desc *desc); extern void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc); extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg); extern int irq_chip_pm_get(struct irq_data *data); extern int irq_chip_pm_put(struct irq_data *data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY extern void handle_fasteoi_ack_irq(struct irq_desc *desc); extern void handle_fasteoi_mask_irq(struct irq_desc *desc); extern int irq_chip_set_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool val); extern int irq_chip_get_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data); extern int irq_chip_retrigger_hierarchy(struct irq_data *data); extern void irq_chip_mask_parent(struct irq_data *data); extern void irq_chip_mask_ack_parent(struct irq_data *data); extern void irq_chip_unmask_parent(struct irq_data *data); extern void irq_chip_eoi_parent(struct irq_data *data); extern int irq_chip_set_affinity_parent(struct irq_data *data, const struct cpumask *dest, bool force); extern int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on); extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info); extern int irq_chip_set_type_parent(struct irq_data *data, unsigned int type); extern int irq_chip_request_resources_parent(struct irq_data *data); extern void irq_chip_release_resources_parent(struct irq_data *data); #endif /* Handling of unhandled and spurious interrupts: */ extern void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret); /* Enable/disable irq debugging output: */ extern int noirqdebug_setup(char *str); /* Checks whether the interrupt can be requested by request_irq(): */ extern int can_request_irq(unsigned int irq, unsigned long irqflags); /* Dummy irq-chip implementations: */ extern struct irq_chip no_irq_chip; extern struct irq_chip dummy_irq_chip; extern void irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle, const char *name); static inline void irq_set_chip_and_handler(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle) { irq_set_chip_and_handler_name(irq, chip, handle, NULL); } extern int irq_set_percpu_devid(unsigned int irq); extern int irq_set_percpu_devid_partition(unsigned int irq, const struct cpumask *affinity); extern int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity); extern void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name); static inline void irq_set_handler(unsigned int irq, irq_flow_handler_t handle) { __irq_set_handler(irq, handle, 0, NULL); } /* * Set a highlevel chained flow handler for a given IRQ. * (a chained handler is automatically enabled and set to * IRQ_NOREQUEST, IRQ_NOPROBE, and IRQ_NOTHREAD) */ static inline void irq_set_chained_handler(unsigned int irq, irq_flow_handler_t handle) { __irq_set_handler(irq, handle, 1, NULL); } /* * Set a highlevel chained flow handler and its data for a given IRQ. * (a chained handler is automatically enabled and set to * IRQ_NOREQUEST, IRQ_NOPROBE, and IRQ_NOTHREAD) */ void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, void *data); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set); static inline void irq_set_status_flags(unsigned int irq, unsigned long set) { irq_modify_status(irq, 0, set); } static inline void irq_clear_status_flags(unsigned int irq, unsigned long clr) { irq_modify_status(irq, clr, 0); } static inline void irq_set_noprobe(unsigned int irq) { irq_modify_status(irq, 0, IRQ_NOPROBE); } static inline void irq_set_probe(unsigned int irq) { irq_modify_status(irq, IRQ_NOPROBE, 0); } static inline void irq_set_nothread(unsigned int irq) { irq_modify_status(irq, 0, IRQ_NOTHREAD); } static inline void irq_set_thread(unsigned int irq) { irq_modify_status(irq, IRQ_NOTHREAD, 0); } static inline void irq_set_nested_thread(unsigned int irq, bool nest) { if (nest) irq_set_status_flags(irq, IRQ_NESTED_THREAD); else irq_clear_status_flags(irq, IRQ_NESTED_THREAD); } static inline void irq_set_percpu_devid_flags(unsigned int irq) { irq_set_status_flags(irq, IRQ_NOAUTOEN | IRQ_PER_CPU | IRQ_NOTHREAD | IRQ_NOPROBE | IRQ_PER_CPU_DEVID); } /* Set/get chip/data for an IRQ: */ extern int irq_set_chip(unsigned int irq, const struct irq_chip *chip); extern int irq_set_handler_data(unsigned int irq, void *data); extern int irq_set_chip_data(unsigned int irq, void *data); extern int irq_set_irq_type(unsigned int irq, unsigned int type); extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry); extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry); extern struct irq_data *irq_get_irq_data(unsigned int irq); static inline struct irq_chip *irq_get_chip(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip : NULL; } static inline struct irq_chip *irq_data_get_irq_chip(struct irq_data *d) { return d->chip; } static inline void *irq_get_chip_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip_data : NULL; } static inline void *irq_data_get_irq_chip_data(struct irq_data *d) { return d->chip_data; } static inline void *irq_get_handler_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->common->handler_data : NULL; } static inline void *irq_data_get_irq_handler_data(struct irq_data *d) { return d->common->handler_data; } static inline struct msi_desc *irq_get_msi_desc(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->common->msi_desc : NULL; } static inline struct msi_desc *irq_data_get_msi_desc(struct irq_data *d) { return d->common->msi_desc; } static inline u32 irq_get_trigger_type(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? irqd_get_trigger_type(d) : 0; } static inline int irq_common_data_get_node(struct irq_common_data *d) { #ifdef CONFIG_NUMA return d->node; #else return 0; #endif } static inline int irq_data_get_node(struct irq_data *d) { return irq_common_data_get_node(d->common); } static inline const struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) { #ifdef CONFIG_SMP return d->common->affinity; #else return cpumask_of(0); #endif } static inline void irq_data_update_affinity(struct irq_data *d, const struct cpumask *m) { #ifdef CONFIG_SMP cpumask_copy(d->common->affinity, m); #endif } static inline const struct cpumask *irq_get_affinity_mask(int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? irq_data_get_affinity_mask(d) : NULL; } #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static inline const struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { return d->common->effective_affinity; } static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) { cpumask_copy(d->common->effective_affinity, m); } #else static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) { } static inline const struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { return irq_data_get_affinity_mask(d); } #endif static inline const struct cpumask *irq_get_effective_affinity_mask(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? irq_data_get_effective_affinity_mask(d) : NULL; } unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct irq_affinity_desc *affinity); int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct irq_affinity_desc *affinity); /* use macros to avoid needing export.h for THIS_MODULE */ #define irq_alloc_descs(irq, from, cnt, node) \ __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE, NULL) #define irq_alloc_desc(node) \ irq_alloc_descs(-1, 1, 1, node) #define irq_alloc_desc_at(at, node) \ irq_alloc_descs(at, at, 1, node) #define irq_alloc_desc_from(from, node) \ irq_alloc_descs(-1, from, 1, node) #define irq_alloc_descs_from(from, cnt, node) \ irq_alloc_descs(-1, from, cnt, node) #define devm_irq_alloc_descs(dev, irq, from, cnt, node) \ __devm_irq_alloc_descs(dev, irq, from, cnt, node, THIS_MODULE, NULL) #define devm_irq_alloc_desc(dev, node) \ devm_irq_alloc_descs(dev, -1, 1, 1, node) #define devm_irq_alloc_desc_at(dev, at, node) \ devm_irq_alloc_descs(dev, at, at, 1, node) #define devm_irq_alloc_desc_from(dev, from, node) \ devm_irq_alloc_descs(dev, -1, from, 1, node) #define devm_irq_alloc_descs_from(dev, from, cnt, node) \ devm_irq_alloc_descs(dev, -1, from, cnt, node) void irq_free_descs(unsigned int irq, unsigned int cnt); static inline void irq_free_desc(unsigned int irq) { irq_free_descs(irq, 1); } #ifdef CONFIG_GENERIC_IRQ_LEGACY void irq_init_desc(unsigned int irq); #endif /** * struct irq_chip_regs - register offsets for struct irq_gci * @enable: Enable register offset to reg_base * @disable: Disable register offset to reg_base * @mask: Mask register offset to reg_base * @ack: Ack register offset to reg_base * @eoi: Eoi register offset to reg_base * @type: Type configuration register offset to reg_base * @polarity: Polarity configuration register offset to reg_base */ struct irq_chip_regs { unsigned long enable; unsigned long disable; unsigned long mask; unsigned long ack; unsigned long eoi; unsigned long type; unsigned long polarity; }; /** * struct irq_chip_type - Generic interrupt chip instance for a flow type * @chip: The real interrupt chip which provides the callbacks * @regs: Register offsets for this chip * @handler: Flow handler associated with this chip * @type: Chip can handle these flow types * @mask_cache_priv: Cached mask register private to the chip type * @mask_cache: Pointer to cached mask register * * A irq_generic_chip can have several instances of irq_chip_type when * it requires different functions and register offsets for different * flow types. */ struct irq_chip_type { struct irq_chip chip; struct irq_chip_regs regs; irq_flow_handler_t handler; u32 type; u32 mask_cache_priv; u32 *mask_cache; }; /** * struct irq_chip_generic - Generic irq chip data structure * @lock: Lock to protect register and cache data access * @reg_base: Register base address (virtual) * @reg_readl: Alternate I/O accessor (defaults to readl if NULL) * @reg_writel: Alternate I/O accessor (defaults to writel if NULL) * @suspend: Function called from core code on suspend once per * chip; can be useful instead of irq_chip::suspend to * handle chip details even when no interrupts are in use * @resume: Function called from core code on resume once per chip; * can be useful instead of irq_chip::suspend to handle * chip details even when no interrupts are in use * @irq_base: Interrupt base nr for this chip * @irq_cnt: Number of interrupts handled by this chip * @mask_cache: Cached mask register shared between all chip types * @type_cache: Cached type register * @polarity_cache: Cached polarity register * @wake_enabled: Interrupt can wakeup from suspend * @wake_active: Interrupt is marked as an wakeup from suspend source * @num_ct: Number of available irq_chip_type instances (usually 1) * @private: Private data for non generic chip callbacks * @installed: bitfield to denote installed interrupts * @unused: bitfield to denote unused interrupts * @domain: irq domain pointer * @list: List head for keeping track of instances * @chip_types: Array of interrupt irq_chip_types * * Note, that irq_chip_generic can have multiple irq_chip_type * implementations which can be associated to a particular irq line of * an irq_chip_generic instance. That allows to share and protect * state in an irq_chip_generic instance when we need to implement * different flow mechanisms (level/edge) for it. */ struct irq_chip_generic { raw_spinlock_t lock; void __iomem *reg_base; u32 (*reg_readl)(void __iomem *addr); void (*reg_writel)(u32 val, void __iomem *addr); void (*suspend)(struct irq_chip_generic *gc); void (*resume)(struct irq_chip_generic *gc); unsigned int irq_base; unsigned int irq_cnt; u32 mask_cache; u32 type_cache; u32 polarity_cache; u32 wake_enabled; u32 wake_active; unsigned int num_ct; void *private; unsigned long installed; unsigned long unused; struct irq_domain *domain; struct list_head list; struct irq_chip_type chip_types[]; }; /** * enum irq_gc_flags - Initialization flags for generic irq chips * @IRQ_GC_INIT_MASK_CACHE: Initialize the mask_cache by reading mask reg * @IRQ_GC_INIT_NESTED_LOCK: Set the lock class of the irqs to nested for * irq chips which need to call irq_set_wake() on * the parent irq. Usually GPIO implementations * @IRQ_GC_MASK_CACHE_PER_TYPE: Mask cache is chip type private * @IRQ_GC_NO_MASK: Do not calculate irq_data->mask * @IRQ_GC_BE_IO: Use big-endian register accesses (default: LE) */ enum irq_gc_flags { IRQ_GC_INIT_MASK_CACHE = 1 << 0, IRQ_GC_INIT_NESTED_LOCK = 1 << 1, IRQ_GC_MASK_CACHE_PER_TYPE = 1 << 2, IRQ_GC_NO_MASK = 1 << 3, IRQ_GC_BE_IO = 1 << 4, }; /* * struct irq_domain_chip_generic - Generic irq chip data structure for irq domains * @irqs_per_chip: Number of interrupts per chip * @num_chips: Number of chips * @irq_flags_to_set: IRQ* flags to set on irq setup * @irq_flags_to_clear: IRQ* flags to clear on irq setup * @gc_flags: Generic chip specific setup flags * @gc: Array of pointers to generic interrupt chips */ struct irq_domain_chip_generic { unsigned int irqs_per_chip; unsigned int num_chips; unsigned int irq_flags_to_clear; unsigned int irq_flags_to_set; enum irq_gc_flags gc_flags; struct irq_chip_generic *gc[]; }; /* Generic chip callback functions */ void irq_gc_noop(struct irq_data *d); void irq_gc_mask_disable_reg(struct irq_data *d); void irq_gc_mask_set_bit(struct irq_data *d); void irq_gc_mask_clr_bit(struct irq_data *d); void irq_gc_unmask_enable_reg(struct irq_data *d); void irq_gc_ack_set_bit(struct irq_data *d); void irq_gc_ack_clr_bit(struct irq_data *d); void irq_gc_mask_disable_and_ack_set(struct irq_data *d); void irq_gc_eoi(struct irq_data *d); int irq_gc_set_wake(struct irq_data *d, unsigned int on); /* Setup functions for irq_chip_generic */ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw_irq); void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq); struct irq_chip_generic * irq_alloc_generic_chip(const char *name, int nr_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler); void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, enum irq_gc_flags flags, unsigned int clr, unsigned int set); int irq_setup_alt_chip(struct irq_data *d, unsigned int type); void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set); struct irq_chip_generic * devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler); int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc, u32 msk, enum irq_gc_flags flags, unsigned int clr, unsigned int set); struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, int num_ct, const char *name, irq_flow_handler_t handler, unsigned int clr, unsigned int set, enum irq_gc_flags flags); #define irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name, \ handler, clr, set, flags) \ ({ \ MAYBE_BUILD_BUG_ON(irqs_per_chip > 32); \ __irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name,\ handler, clr, set, flags); \ }) static inline void irq_free_generic_chip(struct irq_chip_generic *gc) { kfree(gc); } static inline void irq_destroy_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set) { irq_remove_generic_chip(gc, msk, clr, set); irq_free_generic_chip(gc); } static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) { return container_of(d->chip, struct irq_chip_type, chip); } #define IRQ_MSK(n) (u32)((n) < 32 ? ((1 << (n)) - 1) : UINT_MAX) #ifdef CONFIG_SMP static inline void irq_gc_lock(struct irq_chip_generic *gc) { raw_spin_lock(&gc->lock); } static inline void irq_gc_unlock(struct irq_chip_generic *gc) { raw_spin_unlock(&gc->lock); } #else static inline void irq_gc_lock(struct irq_chip_generic *gc) { } static inline void irq_gc_unlock(struct irq_chip_generic *gc) { } #endif /* * The irqsave variants are for usage in non interrupt code. Do not use * them in irq_chip callbacks. Use irq_gc_lock() instead. */ #define irq_gc_lock_irqsave(gc, flags) \ raw_spin_lock_irqsave(&(gc)->lock, flags) #define irq_gc_unlock_irqrestore(gc, flags) \ raw_spin_unlock_irqrestore(&(gc)->lock, flags) static inline void irq_reg_writel(struct irq_chip_generic *gc, u32 val, int reg_offset) { if (gc->reg_writel) gc->reg_writel(val, gc->reg_base + reg_offset); else writel(val, gc->reg_base + reg_offset); } static inline u32 irq_reg_readl(struct irq_chip_generic *gc, int reg_offset) { if (gc->reg_readl) return gc->reg_readl(gc->reg_base + reg_offset); else return readl(gc->reg_base + reg_offset); } struct irq_matrix; struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits, unsigned int alloc_start, unsigned int alloc_end); void irq_matrix_online(struct irq_matrix *m); void irq_matrix_offline(struct irq_matrix *m); void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, bool replace); int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk); void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk); int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, unsigned int *mapped_cpu); void irq_matrix_reserve(struct irq_matrix *m); void irq_matrix_remove_reserved(struct irq_matrix *m); int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, bool reserved, unsigned int *mapped_cpu); void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, unsigned int bit, bool managed); void irq_matrix_assign(struct irq_matrix *m, unsigned int bit); unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown); unsigned int irq_matrix_allocated(struct irq_matrix *m); unsigned int irq_matrix_reserved(struct irq_matrix *m); void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind); /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); int __ipi_send_single(struct irq_desc *desc, unsigned int cpu); int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest); int ipi_send_single(unsigned int virq, unsigned int cpu); int ipi_send_mask(unsigned int virq, const struct cpumask *dest); void ipi_mux_process(void); int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu)); #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER /* * Registers a generic IRQ handling function as the top-level IRQ handler in * the system, which is generally the first C code called from an assembly * architecture-specific interrupt handler. * * Returns 0 on success, or -EBUSY if an IRQ handler has already been * registered. */ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)); /* * Allows interrupt handlers to find the irqchip that's been registered as the * top-level IRQ handler. */ extern void (*handle_arch_irq)(struct pt_regs *) __ro_after_init; asmlinkage void generic_handle_arch_irq(struct pt_regs *regs); #else #ifndef set_handle_irq #define set_handle_irq(handle_irq) \ do { \ (void)handle_irq; \ WARN_ON(1); \ } while (0) #endif #endif #endif /* _LINUX_IRQ_H */
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" struct devlink_linecard { struct list_head list; struct devlink *devlink; unsigned int index; const struct devlink_linecard_ops *ops; void *priv; enum devlink_linecard_state state; struct mutex state_lock; /* Protects state */ const char *type; struct devlink_linecard_type *types; unsigned int types_count; u32 rel_index; }; unsigned int devlink_linecard_index(struct devlink_linecard *linecard) { return linecard->index; } static struct devlink_linecard * devlink_linecard_get_by_index(struct devlink *devlink, unsigned int linecard_index) { struct devlink_linecard *devlink_linecard; list_for_each_entry(devlink_linecard, &devlink->linecard_list, list) { if (devlink_linecard->index == linecard_index) return devlink_linecard; } return NULL; } static bool devlink_linecard_index_exists(struct devlink *devlink, unsigned int linecard_index) { return devlink_linecard_get_by_index(devlink, linecard_index); } static struct devlink_linecard * devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { if (attrs[DEVLINK_ATTR_LINECARD_INDEX]) { u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]); struct devlink_linecard *linecard; linecard = devlink_linecard_get_by_index(devlink, linecard_index); if (!linecard) return ERR_PTR(-ENODEV); return linecard; } return ERR_PTR(-EINVAL); } static struct devlink_linecard * devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_linecard_get_from_attrs(devlink, info->attrs); } struct devlink_linecard_type { const char *type; const void *priv; }; static int devlink_nl_linecard_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_linecard *linecard, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct devlink_linecard_type *linecard_type; struct nlattr *attr; void *hdr; int i; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index)) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_LINECARD_STATE, linecard->state)) goto nla_put_failure; if (linecard->type && nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard->type)) goto nla_put_failure; if (linecard->types_count) { attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES); if (!attr) goto nla_put_failure; for (i = 0; i < linecard->types_count; i++) { linecard_type = &linecard->types[i]; if (nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard_type->type)) { nla_nest_cancel(msg, attr); goto nla_put_failure; } } nla_nest_end(msg, attr); } if (devlink_rel_devlink_handle_put(msg, devlink, linecard->rel_index, DEVLINK_ATTR_NESTED_DEVLINK, NULL)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_linecard_notify(struct devlink_linecard *linecard, enum devlink_command cmd) { struct devlink *devlink = linecard->devlink; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW && cmd != DEVLINK_CMD_LINECARD_DEL); if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_linecard_fill(msg, devlink, linecard, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } void devlink_linecards_notify_register(struct devlink *devlink) { struct devlink_linecard *linecard; list_for_each_entry(linecard, &devlink->linecard_list, list) devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); } void devlink_linecards_notify_unregister(struct devlink *devlink) { struct devlink_linecard *linecard; list_for_each_entry_reverse(linecard, &devlink->linecard_list, list) devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); } int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_linecard *linecard; struct sk_buff *msg; int err; linecard = devlink_linecard_get_from_info(devlink, info); if (IS_ERR(linecard)) return PTR_ERR(linecard); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; mutex_lock(&linecard->state_lock); err = devlink_nl_linecard_fill(msg, devlink, linecard, DEVLINK_CMD_LINECARD_NEW, info->snd_portid, info->snd_seq, 0, info->extack); mutex_unlock(&linecard->state_lock); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_linecard_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_linecard *linecard; int idx = 0; int err = 0; list_for_each_entry(linecard, &devlink->linecard_list, list) { if (idx < state->idx) { idx++; continue; } mutex_lock(&linecard->state_lock); err = devlink_nl_linecard_fill(msg, devlink, linecard, DEVLINK_CMD_LINECARD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, cb->extack); mutex_unlock(&linecard->state_lock); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_linecard_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_linecard_get_dump_one); } static struct devlink_linecard_type * devlink_linecard_type_lookup(struct devlink_linecard *linecard, const char *type) { struct devlink_linecard_type *linecard_type; int i; for (i = 0; i < linecard->types_count; i++) { linecard_type = &linecard->types[i]; if (!strcmp(type, linecard_type->type)) return linecard_type; } return NULL; } static int devlink_linecard_type_set(struct devlink_linecard *linecard, const char *type, struct netlink_ext_ack *extack) { const struct devlink_linecard_ops *ops = linecard->ops; struct devlink_linecard_type *linecard_type; int err; mutex_lock(&linecard->state_lock); if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); err = -EBUSY; goto out; } if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); err = -EBUSY; goto out; } linecard_type = devlink_linecard_type_lookup(linecard, type); if (!linecard_type) { NL_SET_ERR_MSG(extack, "Unsupported line card type provided"); err = -EINVAL; goto out; } if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED && linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { NL_SET_ERR_MSG(extack, "Line card already provisioned"); err = -EBUSY; /* Check if the line card is provisioned in the same * way the user asks. In case it is, make the operation * to return success. */ if (ops->same_provision && ops->same_provision(linecard, linecard->priv, linecard_type->type, linecard_type->priv)) err = 0; goto out; } linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING; linecard->type = linecard_type->type; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); err = ops->provision(linecard, linecard->priv, linecard_type->type, linecard_type->priv, extack); if (err) { /* Provisioning failed. Assume the linecard is unprovisioned * for future operations. */ mutex_lock(&linecard->state_lock); linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; linecard->type = NULL; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); } return err; out: mutex_unlock(&linecard->state_lock); return err; } static int devlink_linecard_type_unset(struct devlink_linecard *linecard, struct netlink_ext_ack *extack) { int err; mutex_lock(&linecard->state_lock); if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { NL_SET_ERR_MSG(extack, "Line card is currently being provisioned"); err = -EBUSY; goto out; } if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned"); err = -EBUSY; goto out; } if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; linecard->type = NULL; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); err = 0; goto out; } if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) { NL_SET_ERR_MSG(extack, "Line card is not provisioned"); err = 0; goto out; } linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); err = linecard->ops->unprovision(linecard, linecard->priv, extack); if (err) { /* Unprovisioning failed. Assume the linecard is unprovisioned * for future operations. */ mutex_lock(&linecard->state_lock); linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; linecard->type = NULL; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); } return err; out: mutex_unlock(&linecard->state_lock); return err; } int devlink_nl_linecard_set_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_linecard *linecard; int err; linecard = devlink_linecard_get_from_info(devlink, info); if (IS_ERR(linecard)) return PTR_ERR(linecard); if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { const char *type; type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]); if (strcmp(type, "")) { err = devlink_linecard_type_set(linecard, type, extack); if (err) return err; } else { err = devlink_linecard_type_unset(linecard, extack); if (err) return err; } } return 0; } static int devlink_linecard_types_init(struct devlink_linecard *linecard) { struct devlink_linecard_type *linecard_type; unsigned int count; int i; count = linecard->ops->types_count(linecard, linecard->priv); linecard->types = kmalloc_array(count, sizeof(*linecard_type), GFP_KERNEL); if (!linecard->types) return -ENOMEM; linecard->types_count = count; for (i = 0; i < count; i++) { linecard_type = &linecard->types[i]; linecard->ops->types_get(linecard, linecard->priv, i, &linecard_type->type, &linecard_type->priv); } return 0; } static void devlink_linecard_types_fini(struct devlink_linecard *linecard) { kfree(linecard->types); } /** * devl_linecard_create - Create devlink linecard * * @devlink: devlink * @linecard_index: driver-specific numerical identifier of the linecard * @ops: linecards ops * @priv: user priv pointer * * Create devlink linecard instance with provided linecard index. * Caller can use any indexing, even hw-related one. * * Return: Line card structure or an ERR_PTR() encoded error code. */ struct devlink_linecard * devl_linecard_create(struct devlink *devlink, unsigned int linecard_index, const struct devlink_linecard_ops *ops, void *priv) { struct devlink_linecard *linecard; int err; if (WARN_ON(!ops || !ops->provision || !ops->unprovision || !ops->types_count || !ops->types_get)) return ERR_PTR(-EINVAL); if (devlink_linecard_index_exists(devlink, linecard_index)) return ERR_PTR(-EEXIST); linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); if (!linecard) return ERR_PTR(-ENOMEM); linecard->devlink = devlink; linecard->index = linecard_index; linecard->ops = ops; linecard->priv = priv; linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; mutex_init(&linecard->state_lock); err = devlink_linecard_types_init(linecard); if (err) { mutex_destroy(&linecard->state_lock); kfree(linecard); return ERR_PTR(err); } list_add_tail(&linecard->list, &devlink->linecard_list); devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); return linecard; } EXPORT_SYMBOL_GPL(devl_linecard_create); /** * devl_linecard_destroy - Destroy devlink linecard * * @linecard: devlink linecard */ void devl_linecard_destroy(struct devlink_linecard *linecard) { devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); list_del(&linecard->list); devlink_linecard_types_fini(linecard); mutex_destroy(&linecard->state_lock); kfree(linecard); } EXPORT_SYMBOL_GPL(devl_linecard_destroy); /** * devlink_linecard_provision_set - Set provisioning on linecard * * @linecard: devlink linecard * @type: linecard type * * This is either called directly from the provision() op call or * as a result of the provision() op call asynchronously. */ void devlink_linecard_provision_set(struct devlink_linecard *linecard, const char *type) { mutex_lock(&linecard->state_lock); WARN_ON(linecard->type && strcmp(linecard->type, type)); linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; linecard->type = type; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); } EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); /** * devlink_linecard_provision_clear - Clear provisioning on linecard * * @linecard: devlink linecard * * This is either called directly from the unprovision() op call or * as a result of the unprovision() op call asynchronously. */ void devlink_linecard_provision_clear(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; linecard->type = NULL; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); } EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear); /** * devlink_linecard_provision_fail - Fail provisioning on linecard * * @linecard: devlink linecard * * This is either called directly from the provision() op call or * as a result of the provision() op call asynchronously. */ void devlink_linecard_provision_fail(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); } EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail); /** * devlink_linecard_activate - Set linecard active * * @linecard: devlink linecard */ void devlink_linecard_activate(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED); linecard->state = DEVLINK_LINECARD_STATE_ACTIVE; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); mutex_unlock(&linecard->state_lock); } EXPORT_SYMBOL_GPL(devlink_linecard_activate); /** * devlink_linecard_deactivate - Set linecard inactive * * @linecard: devlink linecard */ void devlink_linecard_deactivate(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); switch (linecard->state) { case DEVLINK_LINECARD_STATE_ACTIVE: linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); break; case DEVLINK_LINECARD_STATE_UNPROVISIONING: /* Line card is being deactivated as part * of unprovisioning flow. */ break; default: WARN_ON(1); break; } mutex_unlock(&linecard->state_lock); } EXPORT_SYMBOL_GPL(devlink_linecard_deactivate); static void devlink_linecard_rel_notify_cb(struct devlink *devlink, u32 linecard_index) { struct devlink_linecard *linecard; linecard = devlink_linecard_get_by_index(devlink, linecard_index); if (!linecard) return; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); } static void devlink_linecard_rel_cleanup_cb(struct devlink *devlink, u32 linecard_index, u32 rel_index) { struct devlink_linecard *linecard; linecard = devlink_linecard_get_by_index(devlink, linecard_index); if (linecard && linecard->rel_index == rel_index) linecard->rel_index = 0; } /** * devlink_linecard_nested_dl_set - Attach/detach nested devlink * instance to linecard. * * @linecard: devlink linecard * @nested_devlink: devlink instance to attach or NULL to detach */ int devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, struct devlink *nested_devlink) { return devlink_rel_nested_in_add(&linecard->rel_index, linecard->devlink->index, linecard->index, devlink_linecard_rel_notify_cb, devlink_linecard_rel_cleanup_cb, nested_devlink); } EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set);
987 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGETLB_INLINE_H #define _LINUX_HUGETLB_INLINE_H #ifdef CONFIG_HUGETLB_PAGE #include <linux/mm.h> static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { return !!(vma->vm_flags & VM_HUGETLB); } #else static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { return false; } #endif #endif
14928 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 /* SPDX-License-Identifier: GPL-2.0 */ /* * Runtime locking correctness validator * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * see Documentation/locking/lockdep-design.rst for more details. */ #ifndef __LINUX_LOCKDEP_H #define __LINUX_LOCKDEP_H #include <linux/lockdep_types.h> #include <linux/smp.h> #include <asm/percpu.h> struct task_struct; #ifdef CONFIG_LOCKDEP #include <linux/linkage.h> #include <linux/list.h> #include <linux/debug_locks.h> #include <linux/stacktrace.h> static inline void lockdep_copy_map(struct lockdep_map *to, struct lockdep_map *from) { int i; *to = *from; /* * Since the class cache can be modified concurrently we could observe * half pointers (64bit arch using 32bit copy insns). Therefore clear * the caches and take the performance hit. * * XXX it doesn't work well with lockdep_set_class_and_subclass(), since * that relies on cache abuse. */ for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) to->class_cache[i] = NULL; } /* * Every lock has a list of other locks that were taken after it. * We only grow the list, never remove from it: */ struct lock_list { struct list_head entry; struct lock_class *class; struct lock_class *links_to; const struct lock_trace *trace; u16 distance; /* bitmap of different dependencies from head to this */ u8 dep; /* used by BFS to record whether "prev -> this" only has -(*R)-> */ u8 only_xr; /* * The parent field is used to implement breadth-first search, and the * bit 0 is reused to indicate if the lock has been accessed in BFS. */ struct lock_list *parent; }; /** * struct lock_chain - lock dependency chain record * * @irq_context: the same as irq_context in held_lock below * @depth: the number of held locks in this chain * @base: the index in chain_hlocks for this chain * @entry: the collided lock chains in lock_chain hash list * @chain_key: the hash key of this lock_chain */ struct lock_chain { /* see BUILD_BUG_ON()s in add_chain_cache() */ unsigned int irq_context : 2, depth : 6, base : 24; /* 4 byte hole */ struct hlist_node entry; u64 chain_key; }; #define MAX_LOCKDEP_KEYS_BITS 13 #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) #define INITIAL_CHAIN_KEY -1 struct held_lock { /* * One-way hash of the dependency chain up to this point. We * hash the hashes step by step as the dependency chain grows. * * We use it for dependency-caching and we skip detection * passes and dependency-updates if there is a cache-hit, so * it is absolutely critical for 100% coverage of the validator * to have a unique key value for every unique dependency path * that can occur in the system, to make a unique hash value * as likely as possible - hence the 64-bit width. * * The task struct holds the current hash value (initialized * with zero), here we store the previous hash value: */ u64 prev_chain_key; unsigned long acquire_ip; struct lockdep_map *instance; struct lockdep_map *nest_lock; #ifdef CONFIG_LOCK_STAT u64 waittime_stamp; u64 holdtime_stamp; #endif /* * class_idx is zero-indexed; it points to the element in * lock_classes this held lock instance belongs to. class_idx is in * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive. */ unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS; /* * The lock-stack is unified in that the lock chains of interrupt * contexts nest ontop of process context chains, but we 'separate' * the hashes by starting with 0 if we cross into an interrupt * context, and we also keep do not add cross-context lock * dependencies - the lock usage graph walking covers that area * anyway, and we'd just unnecessarily increase the number of * dependencies otherwise. [Note: hardirq and softirq contexts * are separated from each other too.] * * The following field is used to detect when we cross into an * interrupt context: */ unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */ unsigned int trylock:1; /* 16 bits */ unsigned int read:2; /* see lock_acquire() comment */ unsigned int check:1; /* see lock_acquire() comment */ unsigned int hardirqs_off:1; unsigned int sync:1; unsigned int references:11; /* 32 bits */ unsigned int pin_count; }; /* * Initialization, self-test and debugging-output methods: */ extern void lockdep_init(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); extern void lockdep_free_key_range(void *start, unsigned long size); extern asmlinkage void lockdep_sys_exit(void); extern void lockdep_set_selftest_task(struct task_struct *task); extern void lockdep_init_task(struct task_struct *task); /* * Split the recursion counter in two to readily detect 'off' vs recursion. */ #define LOCKDEP_RECURSION_BITS 16 #define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) #define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) /* * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due * to header dependencies. */ #define lockdep_off() \ do { \ current->lockdep_recursion += LOCKDEP_OFF; \ } while (0) #define lockdep_on() \ do { \ current->lockdep_recursion -= LOCKDEP_OFF; \ } while (0) extern void lockdep_register_key(struct lock_class_key *key); extern void lockdep_unregister_key(struct lock_class_key *key); /* * These methods are used by specific locking variants (spinlocks, * rwlocks, mutexes and rwsems) to pass init/acquire/release events * to lockdep: */ extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type); static inline void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner, u8 outer) { lockdep_init_map_type(lock, name, key, subclass, inner, outer, LD_LOCK_NORMAL); } static inline void lockdep_init_map_wait(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner) { lockdep_init_map_waits(lock, name, key, subclass, inner, LD_WAIT_INV); } static inline void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { lockdep_init_map_wait(lock, name, key, subclass, LD_WAIT_INV); } /* * Reinitialize a lock key - for cases where there is special locking or * special initialization of locks so that the validator gets the scope * of dependencies wrong: they are either too broad (they need a class-split) * or they are too narrow (they suffer from a false class-split): */ #define lockdep_set_class(lock, key) \ lockdep_init_map_type(&(lock)->dep_map, #key, key, 0, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) #define lockdep_set_class_and_name(lock, key, name) \ lockdep_init_map_type(&(lock)->dep_map, name, key, 0, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) #define lockdep_set_class_and_subclass(lock, key, sub) \ lockdep_init_map_type(&(lock)->dep_map, #key, key, sub, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) #define lockdep_set_subclass(lock, sub) \ lockdep_init_map_type(&(lock)->dep_map, #lock, (lock)->dep_map.key, sub,\ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) #define lockdep_set_novalidate_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock) /* * Compare locking classes */ #define lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key) static inline int lockdep_match_key(struct lockdep_map *lock, struct lock_class_key *key) { return lock->key == key; } /* * Acquire a lock. * * Values for "read": * * 0: exclusive (write) acquire * 1: read-acquire (no recursion allowed) * 2: read-acquire with same-instance recursion allowed * * Values for check: * * 0: simple checks (freeing, held-at-exit-time, etc.) * 1: full validation */ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *nest_lock, unsigned long ip); extern void lock_release(struct lockdep_map *lock, unsigned long ip); extern void lock_sync(struct lockdep_map *lock, unsigned int subclass, int read, int check, struct lockdep_map *nest_lock, unsigned long ip); /* lock_is_held_type() returns */ #define LOCK_STATE_UNKNOWN -1 #define LOCK_STATE_NOT_HELD 0 #define LOCK_STATE_HELD 1 /* * Same "read" as for lock_acquire(), except -1 means any. */ extern int lock_is_held_type(const struct lockdep_map *lock, int read); static inline int lock_is_held(const struct lockdep_map *lock) { return lock_is_held_type(lock, -1); } #define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) #define lockdep_is_held_type(lock, r) lock_is_held_type(&(lock)->dep_map, (r)) extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip); #define lock_set_novalidate_class(l, n, i) \ lock_set_class(l, n, &__lockdep_no_validate__, 0, i) static inline void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, unsigned long ip) { lock_set_class(lock, lock->name, lock->key, subclass, ip); } extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); #define NIL_COOKIE (struct pin_cookie){ .val = 0U, } extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) #define lockdep_assert(cond) \ do { WARN_ON(debug_locks && !(cond)); } while (0) #define lockdep_assert_once(cond) \ do { WARN_ON_ONCE(debug_locks && !(cond)); } while (0) #define lockdep_assert_held(l) \ lockdep_assert(lockdep_is_held(l) != LOCK_STATE_NOT_HELD) #define lockdep_assert_not_held(l) \ lockdep_assert(lockdep_is_held(l) != LOCK_STATE_HELD) #define lockdep_assert_held_write(l) \ lockdep_assert(lockdep_is_held_type(l, 0)) #define lockdep_assert_held_read(l) \ lockdep_assert(lockdep_is_held_type(l, 1)) #define lockdep_assert_held_once(l) \ lockdep_assert_once(lockdep_is_held(l) != LOCK_STATE_NOT_HELD) #define lockdep_assert_none_held_once() \ lockdep_assert_once(!current->lockdep_depth) #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) #define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) /* * Must use lock_map_aquire_try() with override maps to avoid * lockdep thinking they participate in the block chain. */ #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map _name = { \ .name = #_name "-wait-type-override", \ .wait_type_inner = _wait_type, \ .lock_type = LD_LOCK_WAIT_OVERRIDE, } #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) { } static inline void lockdep_off(void) { } static inline void lockdep_on(void) { } static inline void lockdep_set_selftest_task(struct task_struct *task) { } # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, i) do { } while (0) # define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, key, s, i) do { (void)(key); } while (0) # define lock_set_novalidate_class(l, n, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_wait(lock, name, key, sub, inner) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) # define lockdep_set_class(lock, key) do { (void)(key); } while (0) # define lockdep_set_class_and_name(lock, key, name) \ do { (void)(key); (void)(name); } while (0) #define lockdep_set_class_and_subclass(lock, key, sub) \ do { (void)(key); } while (0) #define lockdep_set_subclass(lock, sub) do { } while (0) #define lockdep_set_novalidate_class(lock) do { } while (0) /* * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP * case since the result is not well defined and the caller should rather * #ifdef the call himself. */ # define lockdep_reset() do { debug_locks = 1; } while (0) # define lockdep_free_key_range(start, size) do { } while (0) # define lockdep_sys_exit() do { } while (0) static inline void lockdep_register_key(struct lock_class_key *key) { } static inline void lockdep_unregister_key(struct lock_class_key *key) { } #define lockdep_depth(tsk) (0) /* * Dummy forward declarations, allow users to write less ifdef-y code * and depend on dead code elimination. */ extern int lock_is_held(const void *); extern int lockdep_is_held(const void *); #define lockdep_is_held_type(l, r) (1) #define lockdep_assert(c) do { } while (0) #define lockdep_assert_once(c) do { } while (0) #define lockdep_assert_held(l) do { (void)(l); } while (0) #define lockdep_assert_not_held(l) do { (void)(l); } while (0) #define lockdep_assert_held_write(l) do { (void)(l); } while (0) #define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_assert_none_held_once() do { } while (0) #define lockdep_recursing(tsk) (0) #define NIL_COOKIE (struct pin_cookie){ } #define lockdep_pin_lock(l) ({ struct pin_cookie cookie = { }; cookie; }) #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map __maybe_unused _name = {} #endif /* !LOCKDEP */ #ifdef CONFIG_PROVE_LOCKING void lockdep_set_lock_cmp_fn(struct lockdep_map *, lock_cmp_fn, lock_print_fn); #define lock_set_cmp_fn(lock, ...) lockdep_set_lock_cmp_fn(&(lock)->dep_map, __VA_ARGS__) #else #define lock_set_cmp_fn(lock, ...) do { } while (0) #endif enum xhlock_context_t { XHLOCK_HARD, XHLOCK_SOFT, XHLOCK_CTX_NR, }; /* * To initialize a lockdep_map statically use this macro. * Note that _name must not be NULL. */ #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_free_task(struct task_struct *task) {} #ifdef CONFIG_LOCK_STAT extern void lock_contended(struct lockdep_map *lock, unsigned long ip); extern void lock_acquired(struct lockdep_map *lock, unsigned long ip); #define LOCK_CONTENDED(_lock, try, lock) \ do { \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ lock(_lock); \ } \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while (0) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ ({ \ int ____err = 0; \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ ____err = lock(_lock); \ } \ if (!____err) \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ ____err; \ }) #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) #define lock_acquired(lockdep_map, ip) do {} while (0) #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ lock(_lock) #endif /* CONFIG_LOCK_STAT */ #ifdef CONFIG_PROVE_LOCKING extern void print_irqtrace_events(struct task_struct *curr); #else static inline void print_irqtrace_events(struct task_struct *curr) { } #endif /* Variable used to make lockdep treat read_lock() as recursive in selftests */ #ifdef CONFIG_DEBUG_LOCKING_API_SELFTESTS extern unsigned int force_read_lock_recursive; #else /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ #define force_read_lock_recursive 0 #endif /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ #ifdef CONFIG_LOCKDEP extern bool read_lock_is_recursive(void); #else /* CONFIG_LOCKDEP */ /* If !LOCKDEP, the value is meaningless */ #define read_lock_is_recursive() 0 #endif /* * For trivial one-depth nesting of a lock-class, the following * global define can be used. (Subsystems with multiple levels * of nesting should define their own lock-nesting subclasses.) */ #define SINGLE_DEPTH_NESTING 1 /* * Map the dependency ops to NOP or to real lockdep ops, depending * on the per lock-class debug mode: */ #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define spin_release(l, i) lock_release(l, i) #define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwlock_acquire_read(l, s, t, i) \ do { \ if (read_lock_is_recursive()) \ lock_acquire_shared_recursive(l, s, t, NULL, i); \ else \ lock_acquire_shared(l, s, t, NULL, i); \ } while (0) #define rwlock_release(l, i) lock_release(l, i) #define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define seqcount_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) #define seqcount_release(l, i) lock_release(l, i) #define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define mutex_release(l, i) lock_release(l, i) #define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i) #define rwsem_release(l, i) lock_release(l, i) #define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_try(l) lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_release(l) lock_release(l, _THIS_IP_) #define lock_map_sync(l) lock_sync(l, 0, 0, 1, NULL, _THIS_IP_) #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) # define might_lock_read(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) # define might_lock_nested(lock, subclass) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, subclass, 0, 1, 1, NULL, \ _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); DECLARE_PER_CPU(unsigned int, lockdep_recursion); #define __lockdep_enabled (debug_locks && !this_cpu_read(lockdep_recursion)) #define lockdep_assert_irqs_enabled() \ do { \ WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_irqs_disabled() \ do { \ WARN_ON_ONCE(__lockdep_enabled && this_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_in_irq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirq_context)); \ } while (0) #define lockdep_assert_no_hardirq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && (this_cpu_read(hardirq_context) || \ !this_cpu_read(hardirqs_enabled))); \ } while (0) #define lockdep_assert_preemption_enabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() != 0 || \ !this_cpu_read(hardirqs_enabled))); \ } while (0) #define lockdep_assert_preemption_disabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() == 0 && \ this_cpu_read(hardirqs_enabled))); \ } while (0) /* * Acceptable for protecting per-CPU resources accessed from BH. * Much like in_softirq() - semantics are ambiguous, use carefully. */ #define lockdep_assert_in_softirq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && \ (!in_softirq() || in_irq() || in_nmi())); \ } while (0) #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define might_lock_nested(lock, subclass) do { } while (0) # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) # define lockdep_assert_in_irq() do { } while (0) # define lockdep_assert_no_hardirq() do { } while (0) # define lockdep_assert_preemption_enabled() do { } while (0) # define lockdep_assert_preemption_disabled() do { } while (0) # define lockdep_assert_in_softirq() do { } while (0) #endif #ifdef CONFIG_PROVE_RAW_LOCK_NESTING # define lockdep_assert_RT_in_threaded_ctx() do { \ WARN_ONCE(debug_locks && !current->lockdep_recursion && \ lockdep_hardirq_context() && \ !(current->hardirq_threaded || current->irq_config), \ "Not in threaded context on PREEMPT_RT as expected\n"); \ } while (0) #else # define lockdep_assert_RT_in_threaded_ctx() do { } while (0) #endif #ifdef CONFIG_LOCKDEP void lockdep_rcu_suspicious(const char *file, const int line, const char *s); #else static inline void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { } #endif #endif /* __LINUX_LOCKDEP_H */
40 24 40 107 102 102 102 102 102 102 107 99 103 99 4 103 107 107 107 107 1 107 107 107 107 107 107 107 107 5 4 106 4 103 99 4 4 103 103 33 105 106 1 1 140 140 130 131 109 109 108 3 3 2 2 2 2 2 2 2 8 6 5 3 116 108 2 107 143 108 108 108 108 107 3 3 107 107 107 107 106 106 106 43 142 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 // SPDX-License-Identifier: GPL-2.0-or-later /* AF_RXRPC sendmsg() implementation. * * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/net.h> #include <linux/gfp.h> #include <linux/skbuff.h> #include <linux/export.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" /* * Propose an abort to be made in the I/O thread. */ bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, enum rxrpc_abort_reason why) { _enter("{%d},%d,%d,%u", call->debug_id, abort_code, error, why); if (!call->send_abort && !rxrpc_call_is_complete(call)) { call->send_abort_why = why; call->send_abort_err = error; call->send_abort_seq = 0; /* Request abort locklessly vs rxrpc_input_call_event(). */ smp_store_release(&call->send_abort, abort_code); rxrpc_poke_call(call, rxrpc_call_poke_abort); return true; } return false; } /* * Wait for a call to become connected. Interruption here doesn't cause the * call to be aborted. */ static int rxrpc_wait_to_be_connected(struct rxrpc_call *call, long *timeo) { DECLARE_WAITQUEUE(myself, current); int ret = 0; _enter("%d", call->debug_id); if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) goto no_wait; add_wait_queue_exclusive(&call->waitq, &myself); for (;;) { switch (call->interruptibility) { case RXRPC_INTERRUPTIBLE: case RXRPC_PREINTERRUPTIBLE: set_current_state(TASK_INTERRUPTIBLE); break; case RXRPC_UNINTERRUPTIBLE: default: set_current_state(TASK_UNINTERRUPTIBLE); break; } if (rxrpc_call_state(call) != RXRPC_CALL_CLIENT_AWAIT_CONN) break; if ((call->interruptibility == RXRPC_INTERRUPTIBLE || call->interruptibility == RXRPC_PREINTERRUPTIBLE) && signal_pending(current)) { ret = sock_intr_errno(*timeo); break; } *timeo = schedule_timeout(*timeo); } remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); no_wait: if (ret == 0 && rxrpc_call_is_complete(call)) ret = call->error; _leave(" = %d", ret); return ret; } /* * Return true if there's sufficient Tx queue space. */ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) { if (_tx_win) *_tx_win = call->tx_bottom; return call->tx_prepared - call->tx_bottom < 256; } /* * Wait for space to appear in the Tx queue or a signal to occur. */ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, struct rxrpc_call *call, long *timeo) { for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (rxrpc_check_tx_space(call, NULL)) return 0; if (rxrpc_call_is_complete(call)) return call->error; if (signal_pending(current)) return sock_intr_errno(*timeo); trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); *timeo = schedule_timeout(*timeo); } } /* * Wait for space to appear in the Tx queue uninterruptibly, but with * a timeout of 2*RTT if no progress was made and a signal occurred. */ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, struct rxrpc_call *call) { rxrpc_seq_t tx_start, tx_win; signed long rtt, timeout; rtt = READ_ONCE(call->peer->srtt_us) >> 3; rtt = usecs_to_jiffies(rtt) * 2; if (rtt < 2) rtt = 2; timeout = rtt; tx_start = smp_load_acquire(&call->acks_hard_ack); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (rxrpc_check_tx_space(call, &tx_win)) return 0; if (rxrpc_call_is_complete(call)) return call->error; if (timeout == 0 && tx_win == tx_start && signal_pending(current)) return -EINTR; if (tx_win != tx_start) { timeout = rtt; tx_start = tx_win; } trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); timeout = schedule_timeout(timeout); } } /* * Wait for space to appear in the Tx queue uninterruptibly. */ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, struct rxrpc_call *call, long *timeo) { for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (rxrpc_check_tx_space(call, NULL)) return 0; if (rxrpc_call_is_complete(call)) return call->error; trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); *timeo = schedule_timeout(*timeo); } } /* * wait for space to appear in the transmit/ACK window * - caller holds the socket locked */ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, struct rxrpc_call *call, long *timeo, bool waitall) { DECLARE_WAITQUEUE(myself, current); int ret; _enter(",{%u,%u,%u,%u}", call->tx_bottom, call->acks_hard_ack, call->tx_top, call->tx_winsize); add_wait_queue(&call->waitq, &myself); switch (call->interruptibility) { case RXRPC_INTERRUPTIBLE: if (waitall) ret = rxrpc_wait_for_tx_window_waitall(rx, call); else ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo); break; case RXRPC_PREINTERRUPTIBLE: case RXRPC_UNINTERRUPTIBLE: default: ret = rxrpc_wait_for_tx_window_nonintr(rx, call, timeo); break; } remove_wait_queue(&call->waitq, &myself); set_current_state(TASK_RUNNING); _leave(" = %d", ret); return ret; } /* * Notify the owner of the call that the transmit phase is ended and the last * packet has been queued. */ static void rxrpc_notify_end_tx(struct rxrpc_sock *rx, struct rxrpc_call *call, rxrpc_notify_end_tx_t notify_end_tx) { if (notify_end_tx) notify_end_tx(&rx->sk, call, call->user_call_ID); } /* * Queue a DATA packet for transmission, set the resend timeout and send * the packet immediately. Returns the error from rxrpc_send_data_packet() * in case the caller wants to do something with it. */ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_notify_end_tx_t notify_end_tx) { rxrpc_seq_t seq = txb->seq; bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags), poke; rxrpc_inc_stat(call->rxnet, stat_tx_data); ASSERTCMP(txb->seq, ==, call->tx_prepared + 1); /* We have to set the timestamp before queueing as the retransmit * algorithm can see the packet as soon as we queue it. */ txb->last_sent = ktime_get_real(); if (last) trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last); else trace_rxrpc_txqueue(call, rxrpc_txqueue_queue); /* Add the packet to the call's output buffer */ spin_lock(&call->tx_lock); poke = list_empty(&call->tx_sendmsg); list_add_tail(&txb->call_link, &call->tx_sendmsg); call->tx_prepared = seq; if (last) rxrpc_notify_end_tx(rx, call, notify_end_tx); spin_unlock(&call->tx_lock); if (poke) rxrpc_poke_call(call, rxrpc_call_poke_start); } /* * send data through a socket * - must be called in process context * - The caller holds the call user access mutex, but not the socket lock. */ static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, struct msghdr *msg, size_t len, rxrpc_notify_end_tx_t notify_end_tx, bool *_dropped_lock) { struct rxrpc_txbuf *txb; struct sock *sk = &rx->sk; enum rxrpc_call_state state; long timeo; bool more = msg->msg_flags & MSG_MORE; int ret, copied = 0; timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); ret = rxrpc_wait_to_be_connected(call, &timeo); if (ret < 0) return ret; if (call->conn->state == RXRPC_CONN_CLIENT_UNSECURED) { ret = rxrpc_init_client_conn_security(call->conn); if (ret < 0) return ret; } /* this should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); reload: ret = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) goto maybe_error; state = rxrpc_call_state(call); ret = -ESHUTDOWN; if (state >= RXRPC_CALL_COMPLETE) goto maybe_error; ret = -EPROTO; if (state != RXRPC_CALL_CLIENT_SEND_REQUEST && state != RXRPC_CALL_SERVER_ACK_REQUEST && state != RXRPC_CALL_SERVER_SEND_REPLY) { /* Request phase complete for this client call */ trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, call->cid, call->call_id, call->rx_consumed, 0, -EPROTO); goto maybe_error; } ret = -EMSGSIZE; if (call->tx_total_len != -1) { if (len - copied > call->tx_total_len) goto maybe_error; if (!more && len - copied != call->tx_total_len) goto maybe_error; } txb = call->tx_pending; call->tx_pending = NULL; if (txb) rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more); do { if (!txb) { size_t remain, bufsize, chunk, offset; _debug("alloc"); if (!rxrpc_check_tx_space(call, NULL)) goto wait_for_space; /* Work out the maximum size of a packet. Assume that * the security header is going to be in the padded * region (enc blocksize), but the trailer is not. */ remain = more ? INT_MAX : msg_data_left(msg); ret = call->conn->security->how_much_data(call, remain, &bufsize, &chunk, &offset); if (ret < 0) goto maybe_error; _debug("SIZE: %zu/%zu @%zu", chunk, bufsize, offset); /* create a buffer that we can retain until it's ACK'd */ ret = -ENOMEM; txb = rxrpc_alloc_txbuf(call, RXRPC_PACKET_TYPE_DATA, GFP_KERNEL); if (!txb) goto maybe_error; txb->offset = offset; txb->space -= offset; txb->space = min_t(size_t, chunk, txb->space); } _debug("append"); /* append next segment of data to the current buffer */ if (msg_data_left(msg) > 0) { size_t copy = min_t(size_t, txb->space, msg_data_left(msg)); _debug("add %zu", copy); if (!copy_from_iter_full(txb->data + txb->offset, copy, &msg->msg_iter)) goto efault; _debug("added"); txb->space -= copy; txb->len += copy; txb->offset += copy; copied += copy; if (call->tx_total_len != -1) call->tx_total_len -= copy; } /* check for the far side aborting the call or a network error * occurring */ if (rxrpc_call_is_complete(call)) goto call_terminated; /* add the packet to the send queue if it's now full */ if (!txb->space || (msg_data_left(msg) == 0 && !more)) { if (msg_data_left(msg) == 0 && !more) { txb->wire.flags |= RXRPC_LAST_PACKET; __set_bit(RXRPC_TXBUF_LAST, &txb->flags); } else if (call->tx_top - call->acks_hard_ack < call->tx_winsize) txb->wire.flags |= RXRPC_MORE_PACKETS; ret = call->security->secure_packet(call, txb); if (ret < 0) goto out; rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } } while (msg_data_left(msg) > 0); success: ret = copied; if (rxrpc_call_is_complete(call) && call->error < 0) ret = call->error; out: call->tx_pending = txb; _leave(" = %d", ret); return ret; call_terminated: rxrpc_put_txbuf(txb, rxrpc_txbuf_put_send_aborted); _leave(" = %d", call->error); return call->error; maybe_error: if (copied) goto success; goto out; efault: ret = -EFAULT; goto out; wait_for_space: ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; mutex_unlock(&call->user_mutex); *_dropped_lock = true; ret = rxrpc_wait_for_tx_window(rx, call, &timeo, msg->msg_flags & MSG_WAITALL); if (ret < 0) goto maybe_error; if (call->interruptibility == RXRPC_INTERRUPTIBLE) { if (mutex_lock_interruptible(&call->user_mutex) < 0) { ret = sock_intr_errno(timeo); goto maybe_error; } } else { mutex_lock(&call->user_mutex); } *_dropped_lock = false; goto reload; } /* * extract control messages from the sendmsg() control buffer */ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) { struct cmsghdr *cmsg; bool got_user_ID = false; int len; if (msg->msg_controllen == 0) return -EINVAL; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; len = cmsg->cmsg_len - sizeof(struct cmsghdr); _debug("CMSG %d, %d, %d", cmsg->cmsg_level, cmsg->cmsg_type, len); if (cmsg->cmsg_level != SOL_RXRPC) continue; switch (cmsg->cmsg_type) { case RXRPC_USER_CALL_ID: if (msg->msg_flags & MSG_CMSG_COMPAT) { if (len != sizeof(u32)) return -EINVAL; p->call.user_call_ID = *(u32 *)CMSG_DATA(cmsg); } else { if (len != sizeof(unsigned long)) return -EINVAL; p->call.user_call_ID = *(unsigned long *) CMSG_DATA(cmsg); } got_user_ID = true; break; case RXRPC_ABORT: if (p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; p->command = RXRPC_CMD_SEND_ABORT; if (len != sizeof(p->abort_code)) return -EINVAL; p->abort_code = *(unsigned int *)CMSG_DATA(cmsg); if (p->abort_code == 0) return -EINVAL; break; case RXRPC_CHARGE_ACCEPT: if (p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; p->command = RXRPC_CMD_CHARGE_ACCEPT; if (len != 0) return -EINVAL; break; case RXRPC_EXCLUSIVE_CALL: p->exclusive = true; if (len != 0) return -EINVAL; break; case RXRPC_UPGRADE_SERVICE: p->upgrade = true; if (len != 0) return -EINVAL; break; case RXRPC_TX_LENGTH: if (p->call.tx_total_len != -1 || len != sizeof(__s64)) return -EINVAL; p->call.tx_total_len = *(__s64 *)CMSG_DATA(cmsg); if (p->call.tx_total_len < 0) return -EINVAL; break; case RXRPC_SET_CALL_TIMEOUT: if (len & 3 || len < 4 || len > 12) return -EINVAL; memcpy(&p->call.timeouts, CMSG_DATA(cmsg), len); p->call.nr_timeouts = len / 4; if (p->call.timeouts.hard > INT_MAX / HZ) return -ERANGE; if (p->call.nr_timeouts >= 2 && p->call.timeouts.idle > 60 * 60 * 1000) return -ERANGE; if (p->call.nr_timeouts >= 3 && p->call.timeouts.normal > 60 * 60 * 1000) return -ERANGE; break; default: return -EINVAL; } } if (!got_user_ID) return -EINVAL; if (p->call.tx_total_len != -1 && p->command != RXRPC_CMD_SEND_DATA) return -EINVAL; _leave(" = 0"); return 0; } /* * Create a new client call for sendmsg(). * - Called with the socket lock held, which it must release. * - If it returns a call, the call's lock will need releasing by the caller. */ static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, struct rxrpc_send_params *p) __releases(&rx->sk.sk_lock.slock) __acquires(&call->user_mutex) { struct rxrpc_conn_parameters cp; struct rxrpc_call *call; struct key *key; DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name); _enter(""); if (!msg->msg_name) { release_sock(&rx->sk); return ERR_PTR(-EDESTADDRREQ); } key = rx->key; if (key && !rx->key->payload.data[0]) key = NULL; memset(&cp, 0, sizeof(cp)); cp.local = rx->local; cp.key = rx->key; cp.security_level = rx->min_sec_level; cp.exclusive = rx->exclusive | p->exclusive; cp.upgrade = p->upgrade; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL, atomic_inc_return(&rxrpc_debug_id)); /* The socket is now unlocked */ _leave(" = %p\n", call); return call; } /* * send a message forming part of a client call through an RxRPC socket * - caller holds the socket locked * - the socket may be either a client socket or a server socket */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call; unsigned long now, j; bool dropped_lock = false; int ret; struct rxrpc_send_params p = { .call.tx_total_len = -1, .call.user_call_ID = 0, .call.nr_timeouts = 0, .call.interruptibility = RXRPC_INTERRUPTIBLE, .abort_code = 0, .command = RXRPC_CMD_SEND_DATA, .exclusive = false, .upgrade = false, }; _enter(""); ret = rxrpc_sendmsg_cmsg(msg, &p); if (ret < 0) goto error_release_sock; if (p.command == RXRPC_CMD_CHARGE_ACCEPT) { ret = -EINVAL; if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) goto error_release_sock; ret = rxrpc_user_charge_accept(rx, p.call.user_call_ID); goto error_release_sock; } call = rxrpc_find_call_by_user_ID(rx, p.call.user_call_ID); if (!call) { ret = -EBADSLT; if (p.command != RXRPC_CMD_SEND_DATA) goto error_release_sock; call = rxrpc_new_client_call_for_sendmsg(rx, msg, &p); /* The socket is now unlocked... */ if (IS_ERR(call)) return PTR_ERR(call); /* ... and we have the call lock. */ p.call.nr_timeouts = 0; ret = 0; if (rxrpc_call_is_complete(call)) goto out_put_unlock; } else { switch (rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_AWAIT_CONN: case RXRPC_CALL_SERVER_SECURING: if (p.command == RXRPC_CMD_SEND_ABORT) break; fallthrough; case RXRPC_CALL_UNINITIALISED: case RXRPC_CALL_SERVER_PREALLOC: rxrpc_put_call(call, rxrpc_call_put_sendmsg); ret = -EBUSY; goto error_release_sock; default: break; } ret = mutex_lock_interruptible(&call->user_mutex); release_sock(&rx->sk); if (ret < 0) { ret = -ERESTARTSYS; goto error_put; } if (p.call.tx_total_len != -1) { ret = -EINVAL; if (call->tx_total_len != -1 || call->tx_pending || call->tx_top != 0) goto out_put_unlock; call->tx_total_len = p.call.tx_total_len; } } switch (p.call.nr_timeouts) { case 3: j = msecs_to_jiffies(p.call.timeouts.normal); if (p.call.timeouts.normal > 0 && j == 0) j = 1; WRITE_ONCE(call->next_rx_timo, j); fallthrough; case 2: j = msecs_to_jiffies(p.call.timeouts.idle); if (p.call.timeouts.idle > 0 && j == 0) j = 1; WRITE_ONCE(call->next_req_timo, j); fallthrough; case 1: if (p.call.timeouts.hard > 0) { j = p.call.timeouts.hard * HZ; now = jiffies; j += now; WRITE_ONCE(call->expect_term_by, j); rxrpc_reduce_call_timer(call, j, now, rxrpc_timer_set_for_hard); } break; } if (rxrpc_call_is_complete(call)) { /* it's too late for this call */ ret = -ESHUTDOWN; } else if (p.command == RXRPC_CMD_SEND_ABORT) { rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, rxrpc_abort_call_sendmsg); ret = 0; } else if (p.command != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; } else { ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock); } out_put_unlock: if (!dropped_lock) mutex_unlock(&call->user_mutex); error_put: rxrpc_put_call(call, rxrpc_call_put_sendmsg); _leave(" = %d", ret); return ret; error_release_sock: release_sock(&rx->sk); return ret; } /** * rxrpc_kernel_send_data - Allow a kernel service to send data on a call * @sock: The socket the call is on * @call: The call to send data through * @msg: The data to send * @len: The amount of data to send * @notify_end_tx: Notification that the last packet is queued. * * Allow a kernel service to send data on a call. The call must be in an state * appropriate to sending data. No control data should be supplied in @msg, * nor should an address be supplied. MSG_MORE should be flagged if there's * more data to come, otherwise this data will end the transmission phase. */ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, size_t len, rxrpc_notify_end_tx_t notify_end_tx) { bool dropped_lock = false; int ret; _enter("{%d},", call->debug_id); ASSERTCMP(msg->msg_name, ==, NULL); ASSERTCMP(msg->msg_control, ==, NULL); mutex_lock(&call->user_mutex); ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len, notify_end_tx, &dropped_lock); if (ret == -ESHUTDOWN) ret = call->error; if (!dropped_lock) mutex_unlock(&call->user_mutex); _leave(" = %d", ret); return ret; } EXPORT_SYMBOL(rxrpc_kernel_send_data); /** * rxrpc_kernel_abort_call - Allow a kernel service to abort a call * @sock: The socket the call is on * @call: The call to be aborted * @abort_code: The abort code to stick into the ABORT packet * @error: Local error value * @why: Indication as to why. * * Allow a kernel service to abort a call, if it's still in an abortable state * and return true if the call was aborted, false if it was already complete. */ bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, u32 abort_code, int error, enum rxrpc_abort_reason why) { bool aborted; _enter("{%d},%d,%d,%u", call->debug_id, abort_code, error, why); mutex_lock(&call->user_mutex); aborted = rxrpc_propose_abort(call, abort_code, error, why); mutex_unlock(&call->user_mutex); return aborted; } EXPORT_SYMBOL(rxrpc_kernel_abort_call); /** * rxrpc_kernel_set_tx_length - Set the total Tx length on a call * @sock: The socket the call is on * @call: The call to be informed * @tx_total_len: The amount of data to be transmitted for this call * * Allow a kernel service to set the total transmit length on a call. This * allows buffer-to-packet encrypt-and-copy to be performed. * * This function is primarily for use for setting the reply length since the * request length can be set when beginning the call. */ void rxrpc_kernel_set_tx_length(struct socket *sock, struct rxrpc_call *call, s64 tx_total_len) { WARN_ON(call->tx_total_len != -1); call->tx_total_len = tx_total_len; } EXPORT_SYMBOL(rxrpc_kernel_set_tx_length);
1618 1620 1619 1617 1619 1618 1613 1616 431 288 288 286 1525 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kdebug.h> #include <linux/kprobes.h> #include <linux/export.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/vmalloc.h> #include <linux/reboot.h> #define CREATE_TRACE_POINTS #include <trace/events/notifier.h> /* * Notifier list for kernel code which wants to be called * at shutdown. This is used to stop any idling DMA operations * and the like. */ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); /* * Notifier chain core routines. The exported routines below * are layered on top of these, with appropriate locking added. */ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n, bool unique_priority) { while ((*nl) != NULL) { if (unlikely((*nl) == n)) { WARN(1, "notifier callback %ps already registered", n->notifier_call); return -EEXIST; } if (n->priority > (*nl)->priority) break; if (n->priority == (*nl)->priority && unique_priority) return -EBUSY; nl = &((*nl)->next); } n->next = *nl; rcu_assign_pointer(*nl, n); trace_notifier_register((void *)n->notifier_call); return 0; } static int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { if ((*nl) == n) { rcu_assign_pointer(*nl, n->next); trace_notifier_unregister((void *)n->notifier_call); return 0; } nl = &((*nl)->next); } return -ENOENT; } /** * notifier_call_chain - Informs the registered notifiers about an event. * @nl: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: Number of notifier functions to be called. Don't care * value of this parameter is -1. * @nr_calls: Records the number of notifications sent. Don't care * value of this field is NULL. * Return: notifier_call_chain returns the value returned by the * last notifier function called. */ static int notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v, int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; nb = rcu_dereference_raw(*nl); while (nb && nr_to_call) { next_nb = rcu_dereference_raw(nb->next); #ifdef CONFIG_DEBUG_NOTIFIERS if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { WARN(1, "Invalid notifier called!"); nb = next_nb; continue; } #endif trace_notifier_run((void *)nb->notifier_call); ret = nb->notifier_call(nb, val, v); if (nr_calls) (*nr_calls)++; if (ret & NOTIFY_STOP_MASK) break; nb = next_nb; nr_to_call--; } return ret; } NOKPROBE_SYMBOL(notifier_call_chain); /** * notifier_call_chain_robust - Inform the registered notifiers about an event * and rollback on error. * @nl: Pointer to head of the blocking notifier chain * @val_up: Value passed unmodified to the notifier function * @val_down: Value passed unmodified to the notifier function when recovering * from an error on @val_up * @v: Pointer passed unmodified to the notifier function * * NOTE: It is important the @nl chain doesn't change between the two * invocations of notifier_call_chain() such that we visit the * exact same notifier callbacks; this rules out any RCU usage. * * Return: the return value of the @val_up call. */ static int notifier_call_chain_robust(struct notifier_block **nl, unsigned long val_up, unsigned long val_down, void *v) { int ret, nr = 0; ret = notifier_call_chain(nl, val_up, v, -1, &nr); if (ret & NOTIFY_STOP_MASK) notifier_call_chain(nl, val_down, v, nr-1, NULL); return ret; } /* * Atomic notifier chain routines. Registration and unregistration * use a spinlock, and call_chain is synchronized by RCU (no locks). */ /** * atomic_notifier_chain_register - Add notifier to an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: New entry in notifier chain * * Adds a notifier to an atomic notifier chain. * * Returns 0 on success, %-EEXIST on error. */ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_register(&nh->head, n, false); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); /** * atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: New entry in notifier chain * * Adds a notifier to an atomic notifier chain if there is no other * notifier registered using the same priority. * * Returns 0 on success, %-EEXIST or %-EBUSY on error. */ int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_register(&nh->head, n, true); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio); /** * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an atomic notifier chain. * * Returns zero on success or %-ENOENT on failure. */ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_unregister(&nh->head, n); spin_unlock_irqrestore(&nh->lock, flags); synchronize_rcu(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); /** * atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in an atomic context, so they must not block. * This routine uses RCU to synchronize with changes to the chain. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v) { int ret; rcu_read_lock(); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); NOKPROBE_SYMBOL(atomic_notifier_call_chain); /** * atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty * @nh: Pointer to head of the atomic notifier chain * * Checks whether notifier chain is empty. * * Returns true is notifier chain is empty, false otherwise. */ bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh) { return !rcu_access_pointer(nh->head); } /* * Blocking notifier chain routines. All access to the chain is * synchronized by an rwsem. */ static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *n, bool unique_priority) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n, unique_priority); down_write(&nh->rwsem); ret = notifier_chain_register(&nh->head, n, unique_priority); up_write(&nh->rwsem); return ret; } /** * blocking_notifier_chain_register - Add notifier to a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * * Adds a notifier to a blocking notifier chain. * Must be called in process context. * * Returns 0 on success, %-EEXIST on error. */ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *n) { return __blocking_notifier_chain_register(nh, n, false); } EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); /** * blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * * Adds a notifier to an blocking notifier chain if there is no other * notifier registered using the same priority. * * Returns 0 on success, %-EEXIST or %-EBUSY on error. */ int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh, struct notifier_block *n) { return __blocking_notifier_chain_register(nh, n, true); } EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio); /** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a blocking notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); down_write(&nh->rwsem); ret = notifier_chain_unregister(&nh->head, n); up_write(&nh->rwsem); return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v) { int ret = NOTIFY_DONE; /* * We check the head outside the lock, but if this access is * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); up_read(&nh->rwsem); } return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust); /** * blocking_notifier_call_chain - Call functions in a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v) { int ret = NOTIFY_DONE; /* * We check the head outside the lock, but if this access is * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); up_read(&nh->rwsem); } return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); /* * Raw notifier chain routines. There is no protection; * the caller must provide it. Use at your own risk! */ /** * raw_notifier_chain_register - Add notifier to a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: New entry in notifier chain * * Adds a notifier to a raw notifier chain. * All locking must be provided by the caller. * * Returns 0 on success, %-EEXIST on error. */ int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_register(&nh->head, n, false); } EXPORT_SYMBOL_GPL(raw_notifier_chain_register); /** * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a raw notifier chain. * All locking must be provided by the caller. * * Returns zero on success or %-ENOENT on failure. */ int raw_notifier_chain_unregister(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_unregister(&nh->head, n); } EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v) { return notifier_call_chain_robust(&nh->head, val_up, val_down, v); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust); /** * raw_notifier_call_chain - Call functions in a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in an undefined context. * All locking must be provided by the caller. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v) { return notifier_call_chain(&nh->head, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); /* * SRCU notifier chain routines. Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). */ /** * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: New entry in notifier chain * * Adds a notifier to an SRCU notifier chain. * Must be called in process context. * * Returns 0 on success, %-EEXIST on error. */ int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n, false); mutex_lock(&nh->mutex); ret = notifier_chain_register(&nh->head, n, false); mutex_unlock(&nh->mutex); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); /** * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an SRCU notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); mutex_lock(&nh->mutex); ret = notifier_chain_unregister(&nh->head, n); mutex_unlock(&nh->mutex); synchronize_srcu(&nh->srcu); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); /** * srcu_notifier_call_chain - Call functions in an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v) { int ret; int idx; idx = srcu_read_lock(&nh->srcu); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); srcu_read_unlock(&nh->srcu, idx); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); /** * srcu_init_notifier_head - Initialize an SRCU notifier head * @nh: Pointer to head of the srcu notifier chain * * Unlike other sorts of notifier heads, SRCU notifier heads require * dynamic initialization. Be sure to call this routine before * calling any of the other SRCU notifier routines for this head. * * If an SRCU notifier head is deallocated, it must first be cleaned * up by calling srcu_cleanup_notifier_head(). Otherwise the head's * per-cpu data (used by the SRCU mechanism) will leak. */ void srcu_init_notifier_head(struct srcu_notifier_head *nh) { mutex_init(&nh->mutex); if (init_srcu_struct(&nh->srcu) < 0) BUG(); nh->head = NULL; } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { .regs = regs, .str = str, .err = err, .trapnr = trap, .signr = sig, }; RCU_LOCKDEP_WARN(!rcu_is_watching(), "notify_die called but RCU thinks we're quiescent"); return atomic_notifier_call_chain(&die_chain, val, &args); } NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); int unregister_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&die_chain, nb); } EXPORT_SYMBOL_GPL(unregister_die_notifier);
1148 13 72 75 766 73 705 758 72 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 // SPDX-License-Identifier: GPL-2.0 /* * hrtimers - High-resolution kernel timers * * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar * * data type definitions, declarations, prototypes * * Started by: Thomas Gleixner and Ingo Molnar */ #ifndef _LINUX_HRTIMER_H #define _LINUX_HRTIMER_H #include <linux/hrtimer_defs.h> #include <linux/rbtree.h> #include <linux/init.h> #include <linux/list.h> #include <linux/percpu.h> #include <linux/seqlock.h> #include <linux/timer.h> #include <linux/timerqueue.h> struct hrtimer_clock_base; struct hrtimer_cpu_base; /* * Mode arguments of xxx_hrtimer functions: * * HRTIMER_MODE_ABS - Time value is absolute * HRTIMER_MODE_REL - Time value is relative to now * HRTIMER_MODE_PINNED - Timer is bound to CPU (is only considered * when starting the timer) * HRTIMER_MODE_SOFT - Timer callback function will be executed in * soft irq context * HRTIMER_MODE_HARD - Timer callback function will be executed in * hard irq context even on PREEMPT_RT. */ enum hrtimer_mode { HRTIMER_MODE_ABS = 0x00, HRTIMER_MODE_REL = 0x01, HRTIMER_MODE_PINNED = 0x02, HRTIMER_MODE_SOFT = 0x04, HRTIMER_MODE_HARD = 0x08, HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED, HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED, HRTIMER_MODE_ABS_SOFT = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT, HRTIMER_MODE_REL_SOFT = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT, HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT, HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT, HRTIMER_MODE_ABS_HARD = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD, HRTIMER_MODE_REL_HARD = HRTIMER_MODE_REL | HRTIMER_MODE_HARD, HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD, HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD, }; /* * Return values for the callback function */ enum hrtimer_restart { HRTIMER_NORESTART, /* Timer is not restarted */ HRTIMER_RESTART, /* Timer must be restarted */ }; /* * Values to track state of the timer * * Possible states: * * 0x00 inactive * 0x01 enqueued into rbtree * * The callback state is not part of the timer->state because clearing it would * mean touching the timer after the callback, this makes it impossible to free * the timer from the callback function. * * Therefore we track the callback state in: * * timer->base->cpu_base->running == timer * * On SMP it is possible to have a "callback function running and enqueued" * status. It happens for example when a posix timer expired and the callback * queued a signal. Between dropping the lock which protects the posix timer * and reacquiring the base lock of the hrtimer, another CPU can deliver the * signal and rearm the timer. * * All state transitions are protected by cpu_base->lock. */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 /** * struct hrtimer - the basic hrtimer structure * @node: timerqueue node, which also manages node.expires, * the absolute expiry time in the hrtimers internal * representation. The time is related to the clock on * which the timer is based. Is setup by adding * slack to the _softexpires value. For non range timers * identical to _softexpires. * @_softexpires: the absolute earliest expiry time of the hrtimer. * The time which was given as expiry time when the timer * was armed. * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) * @is_rel: Set if the timer was armed relative * @is_soft: Set if hrtimer will be expired in soft interrupt context. * @is_hard: Set if hrtimer will be expired in hard interrupt context * even on RT. * * The hrtimer structure must be initialized by hrtimer_init() */ struct hrtimer { struct timerqueue_node node; ktime_t _softexpires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; u8 state; u8 is_rel; u8 is_soft; u8 is_hard; }; /** * struct hrtimer_sleeper - simple sleeper structure * @timer: embedded timer structure * @task: task to wake up * * task is set to NULL, when the timer expires. */ struct hrtimer_sleeper { struct hrtimer timer; struct task_struct *task; }; #ifdef CONFIG_64BIT # define __hrtimer_clock_base_align ____cacheline_aligned #else # define __hrtimer_clock_base_align #endif /** * struct hrtimer_clock_base - the timer base for a specific clock * @cpu_base: per cpu clock base * @index: clock type index for per_cpu support when moving a * timer to a base on another cpu. * @clockid: clock id for per_cpu support * @seq: seqcount around __run_hrtimer * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers * @get_time: function to retrieve the current time of the clock * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; unsigned int index; clockid_t clockid; seqcount_raw_spinlock_t seq; struct hrtimer *running; struct timerqueue_head active; ktime_t (*get_time)(void); ktime_t offset; } __hrtimer_clock_base_align; enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, HRTIMER_BASE_MONOTONIC_SOFT, HRTIMER_BASE_REALTIME_SOFT, HRTIMER_BASE_BOOTTIME_SOFT, HRTIMER_BASE_TAI_SOFT, HRTIMER_MAX_CLOCK_BASES, }; /** * struct hrtimer_cpu_base - the per cpu clock bases * @lock: lock protecting the base and associated clock bases * and timers * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events * @hres_active: State of high resolution mode * @in_hrtirq: hrtimer_interrupt() is currently executing * @hang_detected: The last hrtimer interrupt detected a hang * @softirq_activated: displays, if the softirq is raised - update of softirq * related settings is not required then. * @nr_events: Total number of hrtimer interrupt events * @nr_retries: Total number of hrtimer interrupt retries * @nr_hangs: Total number of hrtimer interrupt hangs * @max_hang_time: Maximum time spent in hrtimer_interrupt * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are * expired * @timer_waiters: A hrtimer_cancel() invocation waits for the timer * callback to finish. * @expires_next: absolute time of the next event, is required for remote * hrtimer enqueue; it is the total first expiry time (hard * and soft hrtimer are taken into account) * @next_timer: Pointer to the first expiring timer * @softirq_expires_next: Time to check, if soft queues needs also to be expired * @softirq_next_timer: Pointer to the first expiring softirq based timer * @clock_base: array of clock bases for this cpu * * Note: next_timer is just an optimization for __remove_hrtimer(). * Do not dereference the pointer because it is not reliable on * cross cpu removals. */ struct hrtimer_cpu_base { raw_spinlock_t lock; unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; unsigned int hres_active : 1, in_hrtirq : 1, hang_detected : 1, softirq_activated : 1; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int nr_events; unsigned short nr_retries; unsigned short nr_hangs; unsigned int max_hang_time; #endif #ifdef CONFIG_PREEMPT_RT spinlock_t softirq_expiry_lock; atomic_t timer_waiters; #endif ktime_t expires_next; struct hrtimer *next_timer; ktime_t softirq_expires_next; struct hrtimer *softirq_next_timer; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; } ____cacheline_aligned; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = time; timer->_softexpires = time; } static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, delta); } static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); } static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64) { timer->node.expires = tv64; timer->_softexpires = tv64; } static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = ktime_add_safe(timer->node.expires, time); timer->_softexpires = ktime_add_safe(timer->_softexpires, time); } static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns) { timer->node.expires = ktime_add_ns(timer->node.expires, ns); timer->_softexpires = ktime_add_ns(timer->_softexpires, ns); } static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer) { return timer->node.expires; } static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) { return timer->_softexpires; } static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer) { return timer->node.expires; } static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) { return timer->_softexpires; } static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) { return ktime_to_ns(timer->node.expires); } static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { return ktime_sub(timer->node.expires, timer->base->get_time()); } static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) { return timer->base->get_time(); } static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? timer->base->cpu_base->hres_active : 0; } #ifdef CONFIG_HIGH_RES_TIMERS struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); extern unsigned int hrtimer_resolution; #else #define hrtimer_resolution (unsigned int)LOW_RES_NSEC #endif static inline ktime_t __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) { ktime_t rem = ktime_sub(timer->node.expires, now); /* * Adjust relative timers for the extra we added in * hrtimer_start_range_ns() to prevent short timeouts. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES) && timer->is_rel) rem -= hrtimer_resolution; return rem; } static inline ktime_t hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) { return __hrtimer_expires_remaining_adjusted(timer, timer->base->get_time()); } #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); extern void timerfd_resume(void); #else static inline void timerfd_clock_was_set(void) { } static inline void timerfd_resume(void) { } #endif DECLARE_PER_CPU(struct tick_device, tick_cpu_device); #ifdef CONFIG_PREEMPT_RT void hrtimer_cancel_wait_running(const struct hrtimer *timer); #else static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) { cpu_relax(); } #endif /* Exported timer functions: */ /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); extern void destroy_hrtimer_on_stack(struct hrtimer *timer); #else static inline void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode) { hrtimer_init(timer, which_clock, mode); } static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { hrtimer_init_sleeper(sl, clock_id, mode); } static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif /* Basic timer operations: */ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 range_ns, const enum hrtimer_mode mode); /** * hrtimer_start - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { hrtimer_start_range_ns(timer, tim, 0, mode); } extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { u64 delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); delta = ktime_to_ns(ktime_sub(hard, soft)); hrtimer_start_range_ns(timer, soft, delta, mode); } void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode); static inline void hrtimer_restart(struct hrtimer *timer) { hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } /* Query timers: */ extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); /** * hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read */ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { return __hrtimer_get_remaining(timer, false); } extern u64 hrtimer_get_next_event(void); extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); /** * hrtimer_is_queued - check, whether the timer is on one of the queues * @timer: Timer to check * * Returns: True if the timer is queued, false otherwise * * The function can be used lockless, but it gives only a current snapshot. */ static inline bool hrtimer_is_queued(struct hrtimer *timer) { /* The READ_ONCE pairs with the update functions of timer->state */ return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED); } /* * Helper function to check, whether the timer is running the callback * function */ static inline int hrtimer_callback_running(struct hrtimer *timer) { return timer->base->running == timer; } /* Forward a hrtimer so it expires after now: */ extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); /** * hrtimer_forward_now - forward the timer expiry so it expires after now * @timer: hrtimer to forward * @interval: the interval to forward * * Forward the timer expiry so it will expire after the current time * of the hrtimer clock base. Returns the number of overruns. * * Can be safely called from the callback function of @timer. If * called from other contexts @timer must neither be enqueued nor * running the callback and the caller needs to take care of * serialization. * * Note: This only updates the timer expiry value and does not requeue * the timer. */ static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { return hrtimer_forward(timer, timer->base->get_time(), interval); } /* Precise sleep: */ extern int nanosleep_copyout(struct restart_block *, struct timespec64 *); extern long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid); extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); /* Show pending timers: */ extern void sysrq_timer_list_show(void); int hrtimers_prepare_cpu(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int hrtimers_cpu_dying(unsigned int cpu); #else #define hrtimers_cpu_dying NULL #endif #endif
12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/pkt_sched.h> #include <net/caif/caif_layer.h> #include <net/caif/cfsrvl.h> #include <net/caif/cfpkt.h> #include <net/caif/caif_dev.h> #define SRVL_CTRL_PKT_SIZE 1 #define SRVL_FLOW_OFF 0x81 #define SRVL_FLOW_ON 0x80 #define SRVL_SET_PIN 0x82 #define container_obj(layr) container_of(layr, struct cfsrvl, layer) static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { struct cfsrvl *service = container_obj(layr); if (layr->up == NULL || layr->up->ctrlcmd == NULL) return; switch (ctrl) { case CAIF_CTRLCMD_INIT_RSP: service->open = true; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case CAIF_CTRLCMD_DEINIT_RSP: case CAIF_CTRLCMD_INIT_FAIL_RSP: service->open = false; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND: if (phyid != service->dev_info.id) break; if (service->modem_flow_on) layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); service->phy_flow_on = false; break; case _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND: if (phyid != service->dev_info.id) return; if (service->modem_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->phy_flow_on = true; break; case CAIF_CTRLCMD_FLOW_OFF_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); } service->modem_flow_on = false; break; case CAIF_CTRLCMD_FLOW_ON_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->modem_flow_on = true; break; case _CAIF_CTRLCMD_PHYIF_DOWN_IND: /* In case interface is down, let's fake a remove shutdown */ layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, phyid); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: layr->up->ctrlcmd(layr->up, ctrl, phyid); break; default: pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl); /* We have both modem and phy flow on, send flow on */ layr->up->ctrlcmd(layr->up, ctrl, phyid); service->phy_flow_on = true; break; } } static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl) { struct cfsrvl *service = container_obj(layr); caif_assert(layr != NULL); caif_assert(layr->dn != NULL); caif_assert(layr->dn->transmit != NULL); if (!service->supports_flowctrl) return 0; switch (ctrl) { case CAIF_MODEMCMD_FLOW_ON_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_on = SRVL_FLOW_ON; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if (cfpkt_add_head(pkt, &flow_on, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } case CAIF_MODEMCMD_FLOW_OFF_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_off = SRVL_FLOW_OFF; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if (cfpkt_add_head(pkt, &flow_off, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } default: break; } return -EINVAL; } static void cfsrvl_release(struct cflayer *layer) { struct cfsrvl *service = container_of(layer, struct cfsrvl, layer); kfree(service); } void cfsrvl_init(struct cfsrvl *service, u8 channel_id, struct dev_info *dev_info, bool supports_flowctrl) { caif_assert(offsetof(struct cfsrvl, layer) == 0); service->open = false; service->modem_flow_on = true; service->phy_flow_on = true; service->layer.id = channel_id; service->layer.ctrlcmd = cfservl_ctrlcmd; service->layer.modemcmd = cfservl_modemcmd; service->dev_info = *dev_info; service->supports_flowctrl = supports_flowctrl; service->release = cfsrvl_release; } bool cfsrvl_ready(struct cfsrvl *service, int *err) { if (!service->open) { *err = -ENOTCONN; return false; } return true; } u8 cfsrvl_getphyid(struct cflayer *layer) { struct cfsrvl *servl = container_obj(layer); return servl->dev_info.id; } bool cfsrvl_phyid_match(struct cflayer *layer, int phyid) { struct cfsrvl *servl = container_obj(layer); return servl->dev_info.id == phyid; } void caif_free_client(struct cflayer *adap_layer) { struct cfsrvl *servl; if (adap_layer == NULL || adap_layer->dn == NULL) return; servl = container_obj(adap_layer->dn); servl->release(&servl->layer); } EXPORT_SYMBOL(caif_free_client); void caif_client_register_refcnt(struct cflayer *adapt_layer, void (*hold)(struct cflayer *lyr), void (*put)(struct cflayer *lyr)) { struct cfsrvl *service; if (WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL)) return; service = container_of(adapt_layer->dn, struct cfsrvl, layer); service->hold = hold; service->put = put; } EXPORT_SYMBOL(caif_client_register_refcnt);
29 29 33 33 33 2 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 34 32 32 32 2 1 34 3 3 35 35 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <net/sock.h> #include <linux/netlink.h> #include <linux/sock_diag.h> #include <linux/netlink_diag.h> #include <linux/rhashtable.h> #include "af_netlink.h" static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->groups == NULL) return 0; return nla_put(nlskb, NETLINK_DIAG_GROUPS, NLGRPSZ(nlk->ngroups), nlk->groups); } static int sk_diag_put_flags(struct sock *sk, struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); u32 flags = 0; if (nlk->cb_running) flags |= NDIAG_FLAG_CB_RUNNING; if (nlk_test_bit(RECV_PKTINFO, sk)) flags |= NDIAG_FLAG_PKTINFO; if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) flags |= NDIAG_FLAG_BROADCAST_ERROR; if (nlk_test_bit(RECV_NO_ENOBUFS, sk)) flags |= NDIAG_FLAG_NO_ENOBUFS; if (nlk_test_bit(LISTEN_ALL_NSID, sk)) flags |= NDIAG_FLAG_LISTEN_ALL_NSID; if (nlk_test_bit(CAP_ACK, sk)) flags |= NDIAG_FLAG_CAP_ACK; return nla_put_u32(skb, NETLINK_DIAG_FLAGS, flags); } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_diag_req *req, u32 portid, u32 seq, u32 flags, int sk_ino) { struct nlmsghdr *nlh; struct netlink_diag_msg *rep; struct netlink_sock *nlk = nlk_sk(sk); nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), flags); if (!nlh) return -EMSGSIZE; rep = nlmsg_data(nlh); rep->ndiag_family = AF_NETLINK; rep->ndiag_type = sk->sk_type; rep->ndiag_protocol = sk->sk_protocol; rep->ndiag_state = sk->sk_state; rep->ndiag_ino = sk_ino; rep->ndiag_portid = nlk->portid; rep->ndiag_dst_portid = nlk->dst_portid; rep->ndiag_dst_group = nlk->dst_group; sock_diag_save_cookie(sk, rep->ndiag_cookie); if ((req->ndiag_show & NDIAG_SHOW_GROUPS) && sk_diag_dump_groups(sk, skb)) goto out_nlmsg_trim; if ((req->ndiag_show & NDIAG_SHOW_MEMINFO) && sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) goto out_nlmsg_trim; if ((req->ndiag_show & NDIAG_SHOW_FLAGS) && sk_diag_put_flags(sk, skb)) goto out_nlmsg_trim; nlmsg_end(skb, nlh); return 0; out_nlmsg_trim: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, int protocol, int s_num) { struct rhashtable_iter *hti = (void *)cb->args[2]; struct netlink_table *tbl = &nl_table[protocol]; struct net *net = sock_net(skb->sk); struct netlink_diag_req *req; struct netlink_sock *nlsk; unsigned long flags; struct sock *sk; int num = 2; int ret = 0; req = nlmsg_data(cb->nlh); if (s_num > 1) goto mc_list; num--; if (!hti) { hti = kmalloc(sizeof(*hti), GFP_KERNEL); if (!hti) return -ENOMEM; cb->args[2] = (long)hti; } if (!s_num) rhashtable_walk_enter(&tbl->hash, hti); rhashtable_walk_start(hti); while ((nlsk = rhashtable_walk_next(hti))) { if (IS_ERR(nlsk)) { ret = PTR_ERR(nlsk); if (ret == -EAGAIN) { ret = 0; continue; } break; } sk = (struct sock *)nlsk; if (!net_eq(sock_net(sk), net)) continue; if (sk_diag_fill(sk, skb, req, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, sock_i_ino(sk)) < 0) { ret = 1; break; } } rhashtable_walk_stop(hti); if (ret) goto done; rhashtable_walk_exit(hti); num++; mc_list: read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &tbl->mc_list) { if (sk_hashed(sk)) continue; if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) { num++; continue; } if (sk_diag_fill(sk, skb, req, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, __sock_i_ino(sk)) < 0) { ret = 1; break; } num++; } read_unlock_irqrestore(&nl_table_lock, flags); done: cb->args[0] = num; return ret; } static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_diag_req *req; int s_num = cb->args[0]; int err = 0; req = nlmsg_data(cb->nlh); if (req->sdiag_protocol == NDIAG_PROTO_ALL) { int i; for (i = cb->args[1]; i < MAX_LINKS; i++) { err = __netlink_diag_dump(skb, cb, i, s_num); if (err) break; s_num = 0; } cb->args[1] = i; } else { if (req->sdiag_protocol >= MAX_LINKS) return -ENOENT; err = __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); } return err < 0 ? err : skb->len; } static int netlink_diag_dump_done(struct netlink_callback *cb) { struct rhashtable_iter *hti = (void *)cb->args[2]; if (cb->args[0] == 1) rhashtable_walk_exit(hti); kfree(hti); return 0; } static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct netlink_diag_req); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = netlink_diag_dump, .done = netlink_diag_dump_done, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } else return -EOPNOTSUPP; } static const struct sock_diag_handler netlink_diag_handler = { .family = AF_NETLINK, .dump = netlink_diag_handler_dump, }; static int __init netlink_diag_init(void) { return sock_diag_register(&netlink_diag_handler); } static void __exit netlink_diag_exit(void) { sock_diag_unregister(&netlink_diag_handler); } module_init(netlink_diag_init); module_exit(netlink_diag_exit); MODULE_DESCRIPTION("Netlink-based socket monitoring/diagnostic interface (sock_diag)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 16 /* AF_NETLINK */);
2 2 2 87 85 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 */ /* * Handle hardware traps and faults. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/context_tracking.h> #include <linux/interrupt.h> #include <linux/kallsyms.h> #include <linux/kmsan.h> #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/kdebug.h> #include <linux/kgdb.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/ptrace.h> #include <linux/uprobes.h> #include <linux/string.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/kexec.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/timer.h> #include <linux/init.h> #include <linux/bug.h> #include <linux/nmi.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/io.h> #include <linux/hardirq.h> #include <linux/atomic.h> #include <linux/iommu.h> #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> #include <asm/realmode.h> #include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> #include <asm/fpu/api.h> #include <asm/cpu.h> #include <asm/cpu_entry_area.h> #include <asm/mce.h> #include <asm/fixmap.h> #include <asm/mach_traps.h> #include <asm/alternative.h> #include <asm/fpu/xstate.h> #include <asm/vm86.h> #include <asm/umip.h> #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/vdso.h> #include <asm/tdx.h> #include <asm/cfi.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> #else #include <asm/processor-flags.h> #include <asm/setup.h> #endif #include <asm/proto.h> DECLARE_BITMAP(system_vectors, NR_VECTORS); __always_inline int is_valid_bugaddr(unsigned long addr) { if (addr < TASK_SIZE_MAX) return 0; /* * We got #UD, if the text isn't readable we'd have gotten * a different exception. */ return *(unsigned short *)addr == INSN_UD2; } static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) { if (v8086_mode(regs)) { /* * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. * On nmi (interrupt 2), do_trap should not be called. */ if (trapnr < X86_TRAP_UD) { if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr)) return 0; } } else if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, error_code, 0)) return 0; tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); } else { if (fixup_vdso_exception(regs, trapnr, error_code, 0)) return 0; } /* * We want error_code and trap_nr set for userspace faults and * kernelspace faults which result in die(), but not * kernelspace faults which are fixed up. die() gives the * process no chance to handle the signal and notice the * kernel fault information, so that won't result in polluting * the information about previously queued, but not yet * delivered, faults. See also exc_general_protection below. */ tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; return -1; } static void show_signal(struct task_struct *tsk, int signr, const char *type, const char *desc, struct pt_regs *regs, long error_code) { if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), type, desc, regs->ip, regs->sp, error_code); print_vma_addr(KERN_CONT " in ", regs->ip); pr_cont("\n"); } } static void do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, int sicode, void __user *addr) { struct task_struct *tsk = current; if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) return; show_signal(tsk, signr, "trap ", str, regs, error_code); if (!sicode) force_sig(signr); else force_sig_fault(signr, sicode, addr); } NOKPROBE_SYMBOL(do_trap); static void do_error_trap(struct pt_regs *regs, long error_code, char *str, unsigned long trapnr, int signr, int sicode, void __user *addr) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, sicode, addr); cond_local_irq_disable(regs); } } /* * Posix requires to provide the address of the faulting instruction for * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t. * * This address is usually regs->ip, but when an uprobe moved the code out * of line then regs->ip points to the XOL code which would confuse * anything which analyzes the fault address vs. the unmodified binary. If * a trap happened in XOL code then uprobe maps regs->ip back to the * original instruction address. */ static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) { return (void __user *)uprobe_get_trap_addr(regs); } DEFINE_IDTENTRY(exc_divide_error) { do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE, FPE_INTDIV, error_get_trap_addr(regs)); } DEFINE_IDTENTRY(exc_overflow) { do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); } #ifdef CONFIG_X86_F00F_BUG void handle_invalid_op(struct pt_regs *regs) #else static inline void handle_invalid_op(struct pt_regs *regs) #endif { do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL, ILL_ILLOPN, error_get_trap_addr(regs)); } static noinstr bool handle_bug(struct pt_regs *regs) { bool handled = false; /* * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() * is a rare case that uses @regs without passing them to * irqentry_enter(). */ kmsan_unpoison_entry_regs(regs); if (!is_valid_bugaddr(regs->ip)) return handled; /* * All lies, just get the WARN/BUG out. */ instrumentation_begin(); /* * Since we're emulating a CALL with exceptions, restore the interrupt * state to what it was at the exception site. */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { regs->ip += LEN_UD2; handled = true; } if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); instrumentation_end(); return handled; } DEFINE_IDTENTRY_RAW(exc_invalid_op) { irqentry_state_t state; /* * We use UD2 as a short encoding for 'CALL __WARN', as such * handle it before exception entry to avoid recursive WARN * in case exception entry is the one triggering WARNs. */ if (!user_mode(regs) && handle_bug(regs)) return; state = irqentry_enter(regs); instrumentation_begin(); handle_invalid_op(regs); instrumentation_end(); irqentry_exit(regs, state); } DEFINE_IDTENTRY(exc_coproc_segment_overrun) { do_error_trap(regs, 0, "coprocessor segment overrun", X86_TRAP_OLD_MF, SIGFPE, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss) { do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present) { do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP, SIGBUS, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment) { do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS, 0, NULL); } DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) { char *str = "alignment check"; if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP) return; if (!user_mode(regs)) die("Split lock detected\n", regs, error_code); local_irq_enable(); if (handle_user_split_lock(regs, error_code)) goto out; do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, error_code, BUS_ADRALN, NULL); out: local_irq_disable(); } #ifdef CONFIG_VMAP_STACK __visible void __noreturn handle_stack_overflow(struct pt_regs *regs, unsigned long fault_address, struct stack_info *info) { const char *name = stack_type_name(info->type); printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n", name, (void *)fault_address, info->begin, info->end); die("stack guard page", regs, 0); /* Be absolutely certain we don't return. */ panic("%s stack guard hit", name); } #endif /* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * * On x86_64, this is more or less a normal kernel entry. Notwithstanding the * SDM's warnings about double faults being unrecoverable, returning works as * expected. Presumably what the SDM actually means is that the CPU may get * the register state wrong on entry, so returning could be a bad idea. * * Various CPU engineers have promised that double faults due to an IRET fault * while the stack is read-only are, in fact, recoverable. * * On x86_32, this is entered through a task gate, and regs are synthesized * from the TSS. Returning is, in principle, okay, but changes to regs will * be lost. If, for some reason, we need to return to a context with modified * regs, the shim code could be adjusted to synchronize the registers. * * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs * to be read before doing anything else. */ DEFINE_IDTENTRY_DF(exc_double_fault) { static const char str[] = "double fault"; struct task_struct *tsk = current; #ifdef CONFIG_VMAP_STACK unsigned long address = read_cr2(); struct stack_info info; #endif #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; /* * If IRET takes a non-IST fault on the espfix64 stack, then we * end up promoting it to a doublefault. In that case, take * advantage of the fact that we're not using the normal (TSS.sp0) * stack right now. We can write a fake #GP(0) frame at TSS.sp0 * and then modify our own IRET frame so that, when we return, * we land directly at the #GP(0) vector with the stack already * set up according to its expectations. * * The net result is that our #GP handler will think that we * entered from usermode with the bad user context. * * No need for nmi_enter() here because we don't use RCU. */ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; unsigned long *p = (unsigned long *)regs->sp; /* * regs->sp points to the failing IRET frame on the * ESPFIX64 stack. Copy it to the entry stack. This fills * in gpregs->ss through gpregs->ip. * */ gpregs->ip = p[0]; gpregs->cs = p[1]; gpregs->flags = p[2]; gpregs->sp = p[3]; gpregs->ss = p[4]; gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ /* * Adjust our frame so that we return straight to the #GP * vector with the expected RSP value. This is safe because * we won't enable interrupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. * * We will enter general_protection with kernel GSBASE, * which is what the stub expects, given that the faulting * RIP will be the IRET instruction. */ regs->ip = (unsigned long)asm_exc_general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; return; } #endif irqentry_nmi_enter(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_DF; #ifdef CONFIG_VMAP_STACK /* * If we overflow the stack into a guard page, the CPU will fail * to deliver #PF and will send #DF instead. Similarly, if we * take any non-IST exception while too close to the bottom of * the stack, the processor will get a page fault while * delivering the exception and will generate a double fault. * * According to the SDM (footnote in 6.15 under "Interrupt 14 - * Page-Fault Exception (#PF): * * Processors update CR2 whenever a page fault is detected. If a * second page fault occurs while an earlier page fault is being * delivered, the faulting linear address of the second fault will * overwrite the contents of CR2 (replacing the previous * address). These updates to CR2 occur even if the page fault * results in a double fault or occurs during the delivery of a * double fault. * * The logic below has a small possibility of incorrectly diagnosing * some errors as stack overflows. For example, if the IDT or GDT * gets corrupted such that #GP delivery fails due to a bad descriptor * causing #GP and we hit this condition while CR2 coincidentally * points to the stack guard page, we'll think we overflowed the * stack. Given that we're going to panic one way or another * if this happens, this isn't necessarily worth fixing. * * If necessary, we could improve the test by only diagnosing * a stack overflow if the saved RSP points within 47 bytes of * the bottom of the stack: if RSP == tsk_stack + 48 and we * take an exception, the stack is already aligned and there * will be enough room SS, RSP, RFLAGS, CS, RIP, and a * possible error code, so a stack overflow would *not* double * fault. With any less space left, exception delivery could * fail, and, as a practical matter, we've overflowed the * stack even if the actual trigger for the double fault was * something else. */ if (get_stack_guard_info((void *)address, &info)) handle_stack_overflow(regs, address, &info); #endif pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); panic("Machine halted."); instrumentation_end(); } DEFINE_IDTENTRY(exc_bounds) { if (notify_die(DIE_TRAP, "bounds", regs, 0, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; cond_local_irq_enable(regs); if (!user_mode(regs)) die("bounds", regs, 0); do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL); cond_local_irq_disable(regs); } enum kernel_gp_hint { GP_NO_HINT, GP_NON_CANONICAL, GP_CANONICAL }; /* * When an uncaught #GP occurs, try to determine the memory address accessed by * the instruction and return that address to the caller. Also, try to figure * out whether any part of the access to that address was non-canonical. */ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, unsigned long *addr) { u8 insn_buf[MAX_INSN_SIZE]; struct insn insn; int ret; if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return GP_NO_HINT; ret = insn_decode_kernel(&insn, insn_buf); if (ret < 0) return GP_NO_HINT; *addr = (unsigned long)insn_get_addr_ref(&insn, regs); if (*addr == -1UL) return GP_NO_HINT; #ifdef CONFIG_X86_64 /* * Check that: * - the operand is not in the kernel half * - the last byte of the operand is not in the user canonical half */ if (*addr < ~__VIRTUAL_MASK && *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK) return GP_NON_CANONICAL; #endif return GP_CANONICAL; } #define GPFSTR "general protection fault" static bool fixup_iopl_exception(struct pt_regs *regs) { struct thread_struct *t = &current->thread; unsigned char byte; unsigned long ip; if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3) return false; if (insn_get_effective_ip(regs, &ip)) return false; if (get_user(byte, (const char __user *)ip)) return false; if (byte != 0xfa && byte != 0xfb) return false; if (!t->iopl_warn && printk_ratelimit()) { pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx", current->comm, task_pid_nr(current), ip); print_vma_addr(KERN_CONT " in ", ip); pr_cont("\n"); t->iopl_warn = 1; } regs->ip += 1; return true; } /* * The unprivileged ENQCMD instruction generates #GPs if the * IA32_PASID MSR has not been populated. If possible, populate * the MSR from a PASID previously allocated to the mm. */ static bool try_fixup_enqcmd_gp(void) { #ifdef CONFIG_IOMMU_SVA u32 pasid; /* * MSR_IA32_PASID is managed using XSAVE. Directly * writing to the MSR is only possible when fpregs * are valid and the fpstate is not. This is * guaranteed when handling a userspace exception * in *before* interrupts are re-enabled. */ lockdep_assert_irqs_disabled(); /* * Hardware without ENQCMD will not generate * #GPs that can be fixed up here. */ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) return false; /* * If the mm has not been allocated a * PASID, the #GP can not be fixed up. */ if (!mm_valid_pasid(current->mm)) return false; pasid = current->mm->pasid; /* * Did this thread already have its PASID activated? * If so, the #GP must be from something else. */ if (current->pasid_activated) return false; wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); current->pasid_activated = 1; return true; #else return false; #endif } static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, unsigned long error_code, const char *str, unsigned long address) { if (fixup_exception(regs, trapnr, error_code, address)) return true; current->thread.error_code = error_code; current->thread.trap_nr = trapnr; /* * To be potentially processing a kprobe fault and to trust the result * from kprobe_running(), we have to be non-preemptible. */ if (!preemptible() && kprobe_running() && kprobe_fault_handler(regs, trapnr)) return true; return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP; } static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, unsigned long error_code, const char *str) { current->thread.error_code = error_code; current->thread.trap_nr = trapnr; show_signal(current, SIGSEGV, "", str, regs, error_code); force_sig(SIGSEGV); } DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; unsigned long gp_addr; if (user_mode(regs) && try_fixup_enqcmd_gp()) return; cond_local_irq_enable(regs); if (static_cpu_has(X86_FEATURE_UMIP)) { if (user_mode(regs) && fixup_umip_exception(regs)) goto exit; } if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); local_irq_disable(); return; } if (user_mode(regs)) { if (fixup_iopl_exception(regs)) goto exit; if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) goto exit; gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc); goto exit; } if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc, 0)) goto exit; if (error_code) snprintf(desc, sizeof(desc), "segment-related " GPFSTR); else hint = get_kernel_gp_address(regs, &gp_addr); if (hint != GP_NO_HINT) snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx", (hint == GP_NON_CANONICAL) ? "probably for non-canonical address" : "maybe for address", gp_addr); /* * KASAN is interested only in the non-canonical case, clear it * otherwise. */ if (hint != GP_NON_CANONICAL) gp_addr = 0; die_addr(desc, regs, error_code, gp_addr); exit: cond_local_irq_disable(regs); } static bool do_int3(struct pt_regs *regs) { int res; #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) return true; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ #ifdef CONFIG_KPROBES if (kprobe_int3_handler(regs)) return true; #endif res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP); return res == NOTIFY_STOP; } NOKPROBE_SYMBOL(do_int3); static void do_int3_user(struct pt_regs *regs) { if (do_int3(regs)) return; cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); cond_local_irq_disable(regs); } DEFINE_IDTENTRY_RAW(exc_int3) { /* * poke_int3_handler() is completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ if (poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() * and therefore can trigger INT3, hence poke_int3_handler() must * be done before. If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. */ if (user_mode(regs)) { irqentry_enter_from_user_mode(regs); instrumentation_begin(); do_int3_user(regs); instrumentation_end(); irqentry_exit_to_user_mode(regs); } else { irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); if (!do_int3(regs)) die("int3", regs, 0); instrumentation_end(); irqentry_nmi_exit(regs, irq_state); } } #ifdef CONFIG_X86_64 /* * Help handler running on a per-cpu (IST or entry trampoline) stack * to switch to the normal thread stack if the interrupted code was in * user mode. The actual stack switch is done in entry_64.S */ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1; if (regs != eregs) *regs = *eregs; return regs; } #ifdef CONFIG_AMD_MEM_ENCRYPT asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs) { unsigned long sp, *stack; struct stack_info info; struct pt_regs *regs_ret; /* * In the SYSCALL entry path the RSP value comes from user-space - don't * trust it and switch to the current kernel stack */ if (ip_within_syscall_gap(regs)) { sp = this_cpu_read(pcpu_hot.top_of_stack); goto sync; } /* * From here on the RSP value is trusted. Now check whether entry * happened from a safe stack. Not safe are the entry or unknown stacks, * use the fall-back stack instead in this case. */ sp = regs->sp; stack = (unsigned long *)sp; if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || info.type > STACK_TYPE_EXCEPTION_LAST) sp = __this_cpu_ist_top_va(VC2); sync: /* * Found a safe stack - switch to it as if the entry didn't happen via * IST stack. The code below only copies pt_regs, the real switch happens * in assembly code. */ sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret); regs_ret = (struct pt_regs *)sp; *regs_ret = *regs; return regs_ret; } #endif asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs) { struct pt_regs tmp, *new_stack; /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault * correctly, we want to move our stack frame to where it would * be had we entered directly on the entry stack (rather than * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. */ new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the temporary storage. */ __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8); /* Copy the remainder of the stack from the current stack. */ __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip)); /* Update the entry stack */ __memcpy(new_stack, &tmp, sizeof(tmp)); BUG_ON(!user_mode(new_stack)); return new_stack; } #endif static bool is_sysenter_singlestep(struct pt_regs *regs) { /* * We don't try for precision here. If we're anywhere in the region of * code that can be single-stepped in the SYSENTER entry path, then * assume that this is a useless single-step trap due to SYSENTER * being invoked with TF set. (We don't know in advance exactly * which instructions will be hit because BTF could plausibly * be set.) */ #ifdef CONFIG_X86_32 return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) < (unsigned long)__end_SYSENTER_singlestep_region - (unsigned long)__begin_SYSENTER_singlestep_region; #elif defined(CONFIG_IA32_EMULATION) return (regs->ip - (unsigned long)entry_SYSENTER_compat) < (unsigned long)__end_entry_SYSENTER_compat - (unsigned long)entry_SYSENTER_compat; #else return false; #endif } static __always_inline unsigned long debug_read_clear_dr6(void) { unsigned long dr6; /* * The Intel SDM says: * * Certain debug exceptions may clear bits 0-3. The remaining * contents of the DR6 register are never cleared by the * processor. To avoid confusion in identifying debug * exceptions, debug handlers should clear the register before * returning to the interrupted task. * * Keep it simple: clear DR6 immediately. */ get_debugreg(dr6, 6); set_debugreg(DR6_RESERVED, 6); dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ return dr6; } /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore * it is possible to get a watchpoint trap here from inside the kernel. * However, the code in ./ptrace.c has ensured that the user can * only set watchpoints on userspace addresses. Therefore the in-kernel * watchpoint trap can only occur in code which is reading/writing * from user space. Such code must not hold kernel locks (since it * can equally take a page fault), therefore it is safe to call * force_sig_info even though that claims and releases locks. * * Code in ./signal.c ensures that the debug control register * is restored before we deliver any signal, and therefore that * user code runs with the correct debug control register even though * we clear it here. * * Being careful here means that we don't have to be as careful in a * lot of more complicated places (task switching can be a bit lazy * about restoring all the debug state, and ptrace doesn't have to * find every occurrence of the TF bit that could be saved away even * by user code) * * May run on IST stack. */ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) { /* * Notifiers will clear bits in @dr6 to indicate the event has been * consumed - hw_breakpoint_handler(), single_stop_cont(). * * Notifiers will set bits in @virtual_dr6 to indicate the desire * for signals - ptrace_triggered(), kgdb_hw_overflow_handler(). */ if (notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0, SIGTRAP) == NOTIFY_STOP) return true; return false; } static __always_inline void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { /* * Disable breakpoints during exception handling; recursive exceptions * are exceedingly 'fun'. * * Since this function is NOKPROBE, and that also applies to * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a * HW_BREAKPOINT_W on our stack) * * Entry text is excluded for HW_BP_X and cpu_entry_area, which * includes the entry stack is excluded for everything. */ unsigned long dr7 = local_db_save(); irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); /* * If something gets miswired and we end up here for a user mode * #DB, we will malfunction. */ WARN_ON_ONCE(user_mode(regs)); if (test_thread_flag(TIF_BLOCKSTEP)) { /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." but PTRACE_BLOCKSTEP requested * it for userspace, but we just took a kernel #DB, so re-set * BTF. */ unsigned long debugctl; rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= DEBUGCTLMSR_BTF; wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. */ if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; /* * The kernel doesn't use INT1 */ if (!dr6) goto out; if (notify_debug(regs, &dr6)) goto out; /* * The kernel doesn't use TF single-step outside of: * * - Kprobes, consumed through kprobe_debug_handler() * - KGDB, consumed through notify_debug() * * So if we get here with DR_STEP set, something is wonky. * * A known way to trigger this is through QEMU's GDB stub, * which leaks #DB into the guest and causes IST recursion. */ if (WARN_ON_ONCE(dr6 & DR_STEP)) regs->flags &= ~X86_EFLAGS_TF; out: instrumentation_end(); irqentry_nmi_exit(regs, irq_state); local_db_restore(dr7); } static __always_inline void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { bool icebp; /* * If something gets miswired and we end up here for a kernel mode * #DB, we will malfunction. */ WARN_ON_ONCE(!user_mode(regs)); /* * NB: We can't easily clear DR7 here because * irqentry_exit_to_usermode() can invoke ptrace, schedule, access * user memory, etc. This means that a recursive #DB is possible. If * this happens, that #DB will hit exc_debug_kernel() and clear DR7. * Since we're not on the IST stack right now, everything will be * fine. */ irqentry_enter_from_user_mode(regs); instrumentation_begin(); /* * Start the virtual/ptrace DR6 value with just the DR_STEP mask * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits. * * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6) * even if it is not the result of PTRACE_SINGLESTEP. */ current->thread.virtual_dr6 = (dr6 & DR_STEP); /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." Clear TIF_BLOCKSTEP to keep * TIF_BLOCKSTEP in sync with the hardware BTF flag. */ clear_thread_flag(TIF_BLOCKSTEP); /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. */ icebp = !dr6; if (notify_debug(regs, &dr6)) goto out; /* It's safe to allow irq's after DR6 has been saved */ local_irq_enable(); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB); goto out_irq; } /* #DB for bus lock can only be triggered from userspace. */ if (dr6 & DR_BUS_LOCK) handle_bus_lock(regs); /* Add the virtual_dr6 bits for signals. */ dr6 |= current->thread.virtual_dr6; if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp) send_sigtrap(regs, 0, get_si_code(dr6)); out_irq: local_irq_disable(); out: instrumentation_end(); irqentry_exit_to_user_mode(regs); } #ifdef CONFIG_X86_64 /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { exc_debug_kernel(regs, debug_read_clear_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { exc_debug_user(regs, debug_read_clear_dr6()); } #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) { unsigned long dr6 = debug_read_clear_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); else exc_debug_kernel(regs, dr6); } #endif /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; struct fpu *fpu = &task->thread.fpu; int si_code; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; cond_local_irq_enable(regs); if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, 0, 0)) goto exit; task->thread.error_code = 0; task->thread.trap_nr = trapnr; if (notify_die(DIE_TRAP, str, regs, 0, trapnr, SIGFPE) != NOTIFY_STOP) die(str, regs, 0); goto exit; } /* * Synchronize the FPU register state to the memory register state * if necessary. This allows the exception handler to inspect it. */ fpu_sync_fpstate(fpu); task->thread.trap_nr = trapnr; task->thread.error_code = 0; si_code = fpu__exception_code(fpu, trapnr); /* Retry when we get spurious exceptions: */ if (!si_code) goto exit; if (fixup_vdso_exception(regs, trapnr, 0, 0)) goto exit; force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); exit: cond_local_irq_disable(regs); } DEFINE_IDTENTRY(exc_coprocessor_error) { math_error(regs, X86_TRAP_MF); } DEFINE_IDTENTRY(exc_simd_coprocessor_error) { if (IS_ENABLED(CONFIG_X86_INVD_BUG)) { /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */ if (!static_cpu_has(X86_FEATURE_XMM)) { __exc_general_protection(regs, 0); return; } } math_error(regs, X86_TRAP_XF); } DEFINE_IDTENTRY(exc_spurious_interrupt_bug) { /* * This addresses a Pentium Pro Erratum: * * PROBLEM: If the APIC subsystem is configured in mixed mode with * Virtual Wire mode implemented through the local APIC, an * interrupt vector of 0Fh (Intel reserved encoding) may be * generated by the local APIC (Int 15). This vector may be * generated upon receipt of a spurious interrupt (an interrupt * which is removed before the system receives the INTA sequence) * instead of the programmed 8259 spurious interrupt vector. * * IMPLICATION: The spurious interrupt vector programmed in the * 8259 is normally handled by an operating system's spurious * interrupt handler. However, a vector of 0Fh is unknown to some * operating systems, which would crash if this erratum occurred. * * In theory this could be limited to 32bit, but the handler is not * hurting and who knows which other CPUs suffer from this. */ } static bool handle_xfd_event(struct pt_regs *regs) { u64 xfd_err; int err; if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) return false; rdmsrl(MSR_IA32_XFD_ERR, xfd_err); if (!xfd_err) return false; wrmsrl(MSR_IA32_XFD_ERR, 0); /* Die if that happens in kernel space */ if (WARN_ON(!user_mode(regs))) return false; local_irq_enable(); err = xfd_enable_feature(xfd_err); switch (err) { case -EPERM: force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); break; case -EFAULT: force_sig(SIGSEGV); break; } local_irq_disable(); return true; } DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); if (handle_xfd_event(regs)) return; #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; cond_local_irq_enable(regs); info.regs = regs; math_emulate(&info); cond_local_irq_disable(regs); return; } #endif /* This should not happen. */ if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) { /* Try to fix it up and carry on. */ write_cr0(cr0 & ~X86_CR0_TS); } else { /* * Something terrible happened, and we're better off trying * to kill the task than getting stuck in a never-ending * loop of #NM faults. */ die("unexpected #NM exception", regs, 0); } } #ifdef CONFIG_INTEL_TDX_GUEST #define VE_FAULT_STR "VE fault" static void ve_raise_fault(struct pt_regs *regs, long error_code, unsigned long address) { if (user_mode(regs)) { gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); return; } if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR, address)) { return; } die_addr(VE_FAULT_STR, regs, error_code, address); } /* * Virtualization Exceptions (#VE) are delivered to TDX guests due to * specific guest actions which may happen in either user space or the * kernel: * * * Specific instructions (WBINVD, for example) * * Specific MSR accesses * * Specific CPUID leaf accesses * * Access to specific guest physical addresses * * In the settings that Linux will run in, virtualization exceptions are * never generated on accesses to normal, TD-private memory that has been * accepted (by BIOS or with tdx_enc_status_changed()). * * Syscall entry code has a critical window where the kernel stack is not * yet set up. Any exception in this window leads to hard to debug issues * and can be exploited for privilege escalation. Exceptions in the NMI * entry code also cause issues. Returning from the exception handler with * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. * * For these reasons, the kernel avoids #VEs during the syscall gap and * the NMI entry code. Entry code paths do not access TD-shared memory, * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves * that might generate #VE. VMM can remove memory from TD at any point, * but access to unaccepted (or missing) private memory leads to VM * termination, not to #VE. * * Similarly to page faults and breakpoints, #VEs are allowed in NMI * handlers once the kernel is ready to deal with nested NMIs. * * During #VE delivery, all interrupts, including NMIs, are blocked until * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads * the VE info. * * If a guest kernel action which would normally cause a #VE occurs in * the interrupt-disabled region before TDGETVEINFO, a #DF (fault * exception) is delivered to the guest which will result in an oops. * * The entry code has been audited carefully for following these expectations. * Changes in the entry code have to be audited for correctness vs. this * aspect. Similarly to #PF, #VE in these places will expose kernel to * privilege escalation or may lead to random crashes. */ DEFINE_IDTENTRY(exc_virtualization_exception) { struct ve_info ve; /* * NMIs/Machine-checks/Interrupts will be in a disabled state * till TDGETVEINFO TDCALL is executed. This ensures that VE * info cannot be overwritten by a nested #VE. */ tdx_get_ve_info(&ve); cond_local_irq_enable(regs); /* * If tdx_handle_virt_exception() could not process * it successfully, treat it as #GP(0) and handle it. */ if (!tdx_handle_virt_exception(regs, &ve)) ve_raise_fault(regs, 0, ve.gla); cond_local_irq_disable(regs); } #endif #ifdef CONFIG_X86_32 DEFINE_IDTENTRY_SW(iret_error) { local_irq_enable(); if (notify_die(DIE_TRAP, "iret exception", regs, 0, X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0, ILL_BADSTK, (void __user *)NULL); } local_irq_disable(); } #endif void __init trap_init(void) { /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); /* Init GHCB memory pages when running as an SEV-ES guest */ sev_es_init_vc_handling(); /* Initialize TSS before setting up traps so ISTs work */ cpu_init_exception_handling(); /* Setup traps as cpu_init() might #GP */ idt_setup_traps(); cpu_init(); }
1 1 29 29 29 27 1 1 29 29 14 12 12 30 15 12 3 2 2 2 2 2 1 12 6 1 1 1 697 229 20 429 423 422 2 695 425 4 2 15 14 1 693 2 6 13 12 13 25 24 17 17 130 222 175 173 13 2 2 2 2 108 1 107 427 427 679 20 20 26 1218 1220 1162 597 760 154 6 697 647 1163 585 586 585 1219 99 1167 80 8 8 7 2 5 8 8 63 987 20 782 799 1995 1995 1993 949 951 951 20 20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 // SPDX-License-Identifier: GPL-2.0 /* * NETLINK Netlink attributes * * Authors: Thomas Graf <tgraf@suug.ch> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/nospec.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> /* For these data types, attribute length should be exactly the given * size. However, to maintain compatibility with broken commands, if the * attribute length does not match the expected size a warning is emitted * to the user that the command is sending invalid data and needs to be fixed. */ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), }; static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_MSECS] = sizeof(u64), [NLA_NESTED] = NLA_HDRLEN, [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), }; /* * Nested policies might refer back to the original * policy in some cases, and userspace could try to * abuse that and recurse by nesting in the right * ways. Limit recursion to avoid this problem. */ #define MAX_POLICY_RECURSION_DEPTH 10 static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth); static int validate_nla_bitfield32(const struct nlattr *nla, const u32 valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); if (!valid_flags_mask) return -EINVAL; /*disallow invalid bit selector */ if (bf->selector & ~valid_flags_mask) return -EINVAL; /*disallow invalid bit values */ if (bf->value & ~valid_flags_mask) return -EINVAL; /*disallow valid bit values that are not selected*/ if (bf->value & ~bf->selector) return -EINVAL; return 0; } static int nla_validate_array(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack, unsigned int validate, unsigned int depth) { const struct nlattr *entry; int rem; nla_for_each_attr(entry, head, len, rem) { int ret; if (nla_len(entry) == 0) continue; if (nla_len(entry) < NLA_HDRLEN) { NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy, "Array element too short"); return -ERANGE; } ret = __nla_validate_parse(nla_data(entry), nla_len(entry), maxtype, policy, validate, extack, NULL, depth + 1); if (ret < 0) return ret; } return 0; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range) { WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR && (pt->min < 0 || pt->max < 0)); range->min = 0; switch (pt->type) { case NLA_U8: range->max = U8_MAX; break; case NLA_U16: case NLA_BE16: case NLA_BINARY: range->max = U16_MAX; break; case NLA_U32: case NLA_BE32: range->max = U32_MAX; break; case NLA_U64: case NLA_UINT: case NLA_MSECS: range->max = U64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_range_unsigned(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { struct netlink_range_validation range; u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_MSECS: value = nla_get_u64(nla); break; case NLA_BINARY: value = nla_len(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } nla_get_range_unsigned(pt, &range); if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG && pt->type == NLA_BINARY && value > range.max) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, pt->type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } /* this assumes min <= max (don't validate against min) */ return 0; } if (value < range.min || value > range.max) { bool binary = pt->type == NLA_BINARY; if (binary) NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "binary attribute size out of range"); else NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range) { switch (pt->type) { case NLA_S8: range->min = S8_MIN; range->max = S8_MAX; break; case NLA_S16: range->min = S16_MIN; range->max = S16_MAX; break; case NLA_S32: range->min = S32_MIN; range->max = S32_MAX; break; case NLA_S64: case NLA_SINT: range->min = S64_MIN; range->max = S64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range_signed; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_int_range_signed(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct netlink_range_validation_signed range; s64 value; switch (pt->type) { case NLA_S8: value = nla_get_s8(nla); break; case NLA_S16: value = nla_get_s16(nla); break; case NLA_S32: value = nla_get_s32(nla); break; case NLA_S64: value = nla_get_s64(nla); break; case NLA_SINT: value = nla_get_sint(nla); break; default: return -EINVAL; } nla_get_range_signed(pt, &range); if (value < range.min || value > range.max) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } static int nla_validate_int_range(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { switch (pt->type) { case NLA_U8: case NLA_U16: case NLA_U32: case NLA_U64: case NLA_UINT: case NLA_MSECS: case NLA_BINARY: case NLA_BE16: case NLA_BE32: return nla_validate_range_unsigned(pt, nla, extack, validate); case NLA_S8: case NLA_S16: case NLA_S32: case NLA_S64: case NLA_SINT: return nla_validate_int_range_signed(pt, nla, extack); default: WARN_ON(1); return -EINVAL; } } static int nla_validate_mask(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } if (value & ~(u64)pt->mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set"); return -EINVAL; } return 0; } static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, unsigned int depth) { u16 strict_start_type = policy[0].strict_start_type; const struct nla_policy *pt; int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla); int err = -ERANGE; if (strict_start_type && type >= strict_start_type) validate |= NL_VALIDATE_STRICT; if (type <= 0 || type > maxtype) return 0; type = array_index_nospec(type, maxtype + 1); pt = &policy[type]; BUG_ON(pt->type > NLA_TYPE_MAX); if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } } if (validate & NL_VALIDATE_NESTED) { if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) && !(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED is missing"); return -EINVAL; } if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY && pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED not expected"); return -EINVAL; } } switch (pt->type) { case NLA_REJECT: if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); extack->_msg = pt->reject_message; return -EINVAL; } err = -EINVAL; goto out_err; case NLA_FLAG: if (attrlen > 0) goto out_err; break; case NLA_SINT: case NLA_UINT: if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } break; case NLA_BITFIELD32: if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; err = validate_nla_bitfield32(nla, pt->bitfield32_valid); if (err) goto out_err; break; case NLA_NUL_STRING: if (pt->len) minlen = min_t(int, attrlen, pt->len + 1); else minlen = attrlen; if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) { err = -EINVAL; goto out_err; } fallthrough; case NLA_STRING: if (attrlen < 1) goto out_err; if (pt->len) { char *buf = nla_data(nla); if (buf[attrlen - 1] == '\0') attrlen--; if (attrlen > pt->len) goto out_err; } break; case NLA_BINARY: if (pt->len && attrlen > pt->len) goto out_err; break; case NLA_NESTED: /* a nested attributes is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { err = __nla_validate_parse(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, validate, extack, NULL, depth + 1); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_NESTED_ARRAY: /* a nested array attribute is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { int err; err = nla_validate_array(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, extack, validate, depth); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_UNSPEC: if (validate & NL_VALIDATE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unsupported attribute"); return -EINVAL; } if (attrlen < pt->len) goto out_err; break; default: if (pt->len) minlen = pt->len; else minlen = nla_attr_minlen[pt->type]; if (attrlen < minlen) goto out_err; } /* further validation */ switch (pt->validation_type) { case NLA_VALIDATE_NONE: /* nothing to do */ break; case NLA_VALIDATE_RANGE_PTR: case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: case NLA_VALIDATE_MIN: case NLA_VALIDATE_MAX: err = nla_validate_int_range(pt, nla, extack, validate); if (err) return err; break; case NLA_VALIDATE_MASK: err = nla_validate_mask(pt, nla, extack); if (err) return err; break; case NLA_VALIDATE_FUNCTION: if (pt->validate) { err = pt->validate(nla, extack); if (err) return err; } break; } return 0; out_err: NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "Attribute failed policy validation"); return err; } static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth) { const struct nlattr *nla; int rem; if (depth >= MAX_POLICY_RECURSION_DEPTH) { NL_SET_ERR_MSG(extack, "allowed policy recursion depth exceeded"); return -EINVAL; } if (tb) memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); nla_for_each_attr(nla, head, len, rem) { u16 type = nla_type(nla); if (type == 0 || type > maxtype) { if (validate & NL_VALIDATE_MAXTYPE) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute type"); return -EINVAL; } continue; } type = array_index_nospec(type, maxtype + 1); if (policy) { int err = validate_nla(nla, maxtype, policy, validate, extack, depth); if (err < 0) return err; } if (tb) tb[type] = (struct nlattr *)nla; } if (unlikely(rem > 0)) { pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n", rem, current->comm); NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes"); if (validate & NL_VALIDATE_TRAILING) return -EINVAL; } return 0; } /** * __nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation depends on the validate flags passed, see * &enum netlink_validation for more details on that. * See documentation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, NULL, 0); } EXPORT_SYMBOL(__nla_validate); /** * nla_policy_len - Determine the max. length of a policy * @p: policy to use * @n: number of policies * * Determines the max. length of the policy. It is currently used * to allocated Netlink buffers roughly the size of the actual * message. * * Returns 0 on success or a negative error code. */ int nla_policy_len(const struct nla_policy *p, int n) { int i, len = 0; for (i = 0; i < n; i++, p++) { if (p->len) len += nla_total_size(p->len); else if (nla_attr_len[p->type]) len += nla_total_size(nla_attr_len[p->type]); else if (nla_attr_minlen[p->type]) len += nla_total_size(nla_attr_minlen[p->type]); } return len; } EXPORT_SYMBOL(nla_policy_len); /** * __nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @validate: validation strictness * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. * Validation is controlled by the @validate parameter. * * Returns 0 on success or a negative error code. */ int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, tb, 0); } EXPORT_SYMBOL(__nla_parse); /** * nla_find - Find a specific attribute in a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @attrtype: type of attribute to look for * * Returns the first attribute in the stream matching the specified type. */ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) { const struct nlattr *nla; int rem; nla_for_each_attr(nla, head, len, rem) if (nla_type(nla) == attrtype) return (struct nlattr *)nla; return NULL; } EXPORT_SYMBOL(nla_find); /** * nla_strscpy - Copy string attribute payload into a sized buffer * @dst: Where to copy the string to. * @nla: Attribute to copy the string from. * @dstsize: Size of destination buffer. * * Copies at most dstsize - 1 bytes into the destination buffer. * Unlike strlcpy the destination buffer is always padded out. * * Return: * * srclen - Returns @nla length (not including the trailing %NUL). * * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater * than @dstsize. */ ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize) { size_t srclen = nla_len(nla); char *src = nla_data(nla); ssize_t ret; size_t len; if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX)) return -E2BIG; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; if (srclen >= dstsize) { len = dstsize - 1; ret = -E2BIG; } else { len = srclen; ret = len; } memcpy(dst, src, len); /* Zero pad end of dst. */ memset(dst + len, 0, dstsize - len); return ret; } EXPORT_SYMBOL(nla_strscpy); /** * nla_strdup - Copy string attribute payload into a newly allocated buffer * @nla: attribute to copy the string from * @flags: the type of memory to allocate (see kmalloc). * * Returns a pointer to the allocated buffer or NULL on error. */ char *nla_strdup(const struct nlattr *nla, gfp_t flags) { size_t srclen = nla_len(nla); char *src = nla_data(nla), *dst; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; dst = kmalloc(srclen + 1, flags); if (dst != NULL) { memcpy(dst, src, srclen); dst[srclen] = '\0'; } return dst; } EXPORT_SYMBOL(nla_strdup); /** * nla_memcpy - Copy a netlink attribute into another memory area * @dest: where to copy to memcpy * @src: netlink attribute to copy from * @count: size of the destination area * * Note: The number of bytes copied is limited by the length of * attribute's payload. memcpy * * Returns the number of bytes copied. */ int nla_memcpy(void *dest, const struct nlattr *src, int count) { int minlen = min_t(int, count, nla_len(src)); memcpy(dest, nla_data(src), minlen); if (count > minlen) memset(dest + minlen, 0, count - minlen); return minlen; } EXPORT_SYMBOL(nla_memcpy); /** * nla_memcmp - Compare an attribute with sized memory area * @nla: netlink attribute * @data: memory area * @size: size of memory area */ int nla_memcmp(const struct nlattr *nla, const void *data, size_t size) { int d = nla_len(nla) - size; if (d == 0) d = memcmp(nla_data(nla), data, size); return d; } EXPORT_SYMBOL(nla_memcmp); /** * nla_strcmp - Compare a string attribute against a string * @nla: netlink string attribute * @str: another string */ int nla_strcmp(const struct nlattr *nla, const char *str) { int len = strlen(str); char *buf = nla_data(nla); int attrlen = nla_len(nla); int d; while (attrlen > 0 && buf[attrlen - 1] == '\0') attrlen--; d = attrlen - len; if (d == 0) d = memcmp(nla_data(nla), str, len); return d; } EXPORT_SYMBOL(nla_strcmp); #ifdef CONFIG_NET /** * __nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { struct nlattr *nla; nla = skb_put(skb, nla_total_size(attrlen)); nla->nla_type = attrtype; nla->nla_len = nla_attr_size(attrlen); memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); return nla; } EXPORT_SYMBOL(__nla_reserve); /** * __nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { nla_align_64bit(skb, padattr); return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(__nla_reserve_64bit); /** * __nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * The caller is responsible to ensure that the skb provides enough * tailroom for the payload. */ void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { return skb_put_zero(skb, NLA_ALIGN(attrlen)); } EXPORT_SYMBOL(__nla_reserve_nohdr); /** * nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return NULL; return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(nla_reserve); /** * nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return NULL; return __nla_reserve_64bit(skb, attrtype, attrlen, padattr); } EXPORT_SYMBOL(nla_reserve_64bit); /** * nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute payload. */ void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return NULL; return __nla_reserve_nohdr(skb, attrlen); } EXPORT_SYMBOL(nla_reserve_nohdr); /** * __nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { struct nlattr *nla; nla = __nla_reserve(skb, attrtype, attrlen); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put); /** * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { struct nlattr *nla; nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put_64bit); /** * __nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute payload. */ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { void *start; start = __nla_reserve_nohdr(skb, attrlen); memcpy(start, data, attrlen); } EXPORT_SYMBOL(__nla_put_nohdr); /** * nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return -EMSGSIZE; __nla_put(skb, attrtype, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put); /** * nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return -EMSGSIZE; __nla_put_64bit(skb, attrtype, attrlen, data, padattr); return 0; } EXPORT_SYMBOL(nla_put_64bit); /** * nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; __nla_put_nohdr(skb, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put_nohdr); /** * nla_append - Add a netlink attribute without header or padding * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_append(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; skb_put_data(skb, data, attrlen); return 0; } EXPORT_SYMBOL(nla_append); #endif
1967 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BYTEORDER_GENERIC_H #define _LINUX_BYTEORDER_GENERIC_H /* * linux/byteorder/generic.h * Generic Byte-reordering support * * The "... p" macros, like le64_to_cpup, can be used with pointers * to unaligned data, but there will be a performance penalty on * some architectures. Use get_unaligned for unaligned data. * * Francois-Rene Rideau <fare@tunes.org> 19970707 * gathered all the good ideas from all asm-foo/byteorder.h into one file, * cleaned them up. * I hope it is compliant with non-GCC compilers. * I decided to put __BYTEORDER_HAS_U64__ in byteorder.h, * because I wasn't sure it would be ok to put it in types.h * Upgraded it to 2.1.43 * Francois-Rene Rideau <fare@tunes.org> 19971012 * Upgraded it to 2.1.57 * to please Linus T., replaced huge #ifdef's between little/big endian * by nestedly #include'd files. * Francois-Rene Rideau <fare@tunes.org> 19971205 * Made it to 2.1.71; now a facelift: * Put files under include/linux/byteorder/ * Split swab from generic support. * * TODO: * = Regular kernel maintainers could also replace all these manual * byteswap macros that remain, disseminated among drivers, * after some grep or the sources... * = Linus might want to rename all these macros and files to fit his taste, * to fit his personal naming scheme. * = it seems that a few drivers would also appreciate * nybble swapping support... * = every architecture could add their byteswap macro in asm/byteorder.h * see how some architectures already do (i386, alpha, ppc, etc) * = cpu_to_beXX and beXX_to_cpu might some day need to be well * distinguished throughout the kernel. This is not the case currently, * since little endian, big endian, and pdp endian machines needn't it. * But this might be the case for, say, a port of Linux to 20/21 bit * architectures (and F21 Linux addict around?). */ /* * The following macros are to be defined by <asm/byteorder.h>: * * Conversion of long and short int between network and host format * ntohl(__u32 x) * ntohs(__u16 x) * htonl(__u32 x) * htons(__u16 x) * It seems that some programs (which? where? or perhaps a standard? POSIX?) * might like the above to be functions, not macros (why?). * if that's true, then detect them, and take measures. * Anyway, the measure is: define only ___ntohl as a macro instead, * and in a separate file, have * unsigned long inline ntohl(x){return ___ntohl(x);} * * The same for constant arguments * __constant_ntohl(__u32 x) * __constant_ntohs(__u16 x) * __constant_htonl(__u32 x) * __constant_htons(__u16 x) * * Conversion of XX-bit integers (16- 32- or 64-) * between native CPU format and little/big endian format * 64-bit stuff only defined for proper architectures * cpu_to_[bl]eXX(__uXX x) * [bl]eXX_to_cpu(__uXX x) * * The same, but takes a pointer to the value to convert * cpu_to_[bl]eXXp(__uXX x) * [bl]eXX_to_cpup(__uXX x) * * The same, but change in situ * cpu_to_[bl]eXXs(__uXX x) * [bl]eXX_to_cpus(__uXX x) * * See asm-foo/byteorder.h for examples of how to provide * architecture-optimized versions * */ #define cpu_to_le64 __cpu_to_le64 #define le64_to_cpu __le64_to_cpu #define cpu_to_le32 __cpu_to_le32 #define le32_to_cpu __le32_to_cpu #define cpu_to_le16 __cpu_to_le16 #define le16_to_cpu __le16_to_cpu #define cpu_to_be64 __cpu_to_be64 #define be64_to_cpu __be64_to_cpu #define cpu_to_be32 __cpu_to_be32 #define be32_to_cpu __be32_to_cpu #define cpu_to_be16 __cpu_to_be16 #define be16_to_cpu __be16_to_cpu #define cpu_to_le64p __cpu_to_le64p #define le64_to_cpup __le64_to_cpup #define cpu_to_le32p __cpu_to_le32p #define le32_to_cpup __le32_to_cpup #define cpu_to_le16p __cpu_to_le16p #define le16_to_cpup __le16_to_cpup #define cpu_to_be64p __cpu_to_be64p #define be64_to_cpup __be64_to_cpup #define cpu_to_be32p __cpu_to_be32p #define be32_to_cpup __be32_to_cpup #define cpu_to_be16p __cpu_to_be16p #define be16_to_cpup __be16_to_cpup #define cpu_to_le64s __cpu_to_le64s #define le64_to_cpus __le64_to_cpus #define cpu_to_le32s __cpu_to_le32s #define le32_to_cpus __le32_to_cpus #define cpu_to_le16s __cpu_to_le16s #define le16_to_cpus __le16_to_cpus #define cpu_to_be64s __cpu_to_be64s #define be64_to_cpus __be64_to_cpus #define cpu_to_be32s __cpu_to_be32s #define be32_to_cpus __be32_to_cpus #define cpu_to_be16s __cpu_to_be16s #define be16_to_cpus __be16_to_cpus /* * They have to be macros in order to do the constant folding * correctly - if the argument passed into a inline function * it is no longer constant according to gcc.. */ #undef ntohl #undef ntohs #undef htonl #undef htons #define ___htonl(x) __cpu_to_be32(x) #define ___htons(x) __cpu_to_be16(x) #define ___ntohl(x) __be32_to_cpu(x) #define ___ntohs(x) __be16_to_cpu(x) #define htonl(x) ___htonl(x) #define ntohl(x) ___ntohl(x) #define htons(x) ___htons(x) #define ntohs(x) ___ntohs(x) static inline void le16_add_cpu(__le16 *var, u16 val) { *var = cpu_to_le16(le16_to_cpu(*var) + val); } static inline void le32_add_cpu(__le32 *var, u32 val) { *var = cpu_to_le32(le32_to_cpu(*var) + val); } static inline void le64_add_cpu(__le64 *var, u64 val) { *var = cpu_to_le64(le64_to_cpu(*var) + val); } /* XXX: this stuff can be optimized */ static inline void le32_to_cpu_array(u32 *buf, unsigned int words) { while (words--) { __le32_to_cpus(buf); buf++; } } static inline void cpu_to_le32_array(u32 *buf, unsigned int words) { while (words--) { __cpu_to_le32s(buf); buf++; } } static inline void be16_add_cpu(__be16 *var, u16 val) { *var = cpu_to_be16(be16_to_cpu(*var) + val); } static inline void be32_add_cpu(__be32 *var, u32 val) { *var = cpu_to_be32(be32_to_cpu(*var) + val); } static inline void be64_add_cpu(__be64 *var, u64 val) { *var = cpu_to_be64(be64_to_cpu(*var) + val); } static inline void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len) { size_t i; for (i = 0; i < len; i++) dst[i] = cpu_to_be32(src[i]); } static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len) { size_t i; for (i = 0; i < len; i++) dst[i] = be32_to_cpu(src[i]); } #endif /* _LINUX_BYTEORDER_GENERIC_H */
2 2 23 23 23 23 23 24 4 19 1 1 23 24 24 7 41 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 // SPDX-License-Identifier: GPL-2.0 /* * XFRM virtual interface * * Copyright (C) 2018 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_link.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/gso.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/ip_tunnels.h> #include <net/addrconf.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/dst_metadata.h> #include <net/netns/generic.h> #include <linux/etherdevice.h> static int xfrmi_dev_init(struct net_device *dev); static void xfrmi_dev_setup(struct net_device *dev); static struct rtnl_link_ops xfrmi_link_ops __read_mostly; static unsigned int xfrmi_net_id __read_mostly; static const struct net_device_ops xfrmi_netdev_ops; #define XFRMI_HASH_BITS 8 #define XFRMI_HASH_SIZE BIT(XFRMI_HASH_BITS) struct xfrmi_net { /* lists for storing interfaces in use */ struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; struct xfrm_if __rcu *collect_md_xfrmi; }; static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = { [LWT_XFRM_IF_ID] = NLA_POLICY_MIN(NLA_U32, 1), [LWT_XFRM_LINK] = NLA_POLICY_MIN(NLA_U32, 1), }; static void xfrmi_destroy_state(struct lwtunnel_state *lwt) { } static int xfrmi_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWT_XFRM_MAX + 1]; struct lwtunnel_state *new_state; struct xfrm_md_info *info; int ret; ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); if (ret < 0) return ret; if (!tb[LWT_XFRM_IF_ID]) { NL_SET_ERR_MSG(extack, "if_id must be set"); return -EINVAL; } new_state = lwtunnel_state_alloc(sizeof(*info)); if (!new_state) { NL_SET_ERR_MSG(extack, "failed to create encap info"); return -ENOMEM; } new_state->type = LWTUNNEL_ENCAP_XFRM; info = lwt_xfrm_info(new_state); info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); if (tb[LWT_XFRM_LINK]) info->link = nla_get_u32(tb[LWT_XFRM_LINK]); *ts = new_state; return 0; } static int xfrmi_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) { struct xfrm_md_info *info = lwt_xfrm_info(lwt); if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) return -EMSGSIZE; return 0; } static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ } static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct xfrm_md_info *a_info = lwt_xfrm_info(a); struct xfrm_md_info *b_info = lwt_xfrm_info(b); return memcmp(a_info, b_info, sizeof(*a_info)); } static const struct lwtunnel_encap_ops xfrmi_encap_ops = { .build_state = xfrmi_build_state, .destroy_state = xfrmi_destroy_state, .fill_encap = xfrmi_fill_encap_info, .get_encap_size = xfrmi_encap_nlsize, .cmp_encap = xfrmi_encap_cmp, .owner = THIS_MODULE, }; #define for_each_xfrmi_rcu(start, xi) \ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) static u32 xfrmi_hash(u32 if_id) { return hash_32(if_id, XFRMI_HASH_BITS); } static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if *xi; for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { if (x->if_id == xi->p.if_id && (xi->dev->flags & IFF_UP)) return xi; } xi = rcu_dereference(xfrmn->collect_md_xfrmi); if (xi && (xi->dev->flags & IFF_UP)) return xi; return NULL; } static bool xfrmi_decode_session(struct sk_buff *skb, unsigned short family, struct xfrm_if_decode_session_result *res) { struct net_device *dev; struct xfrm_if *xi; int ifindex = 0; if (!secpath_exists(skb) || !skb->dev) return false; switch (family) { case AF_INET6: ifindex = inet6_sdif(skb); break; case AF_INET: ifindex = inet_sdif(skb); break; } if (ifindex) { struct net *net = xs_net(xfrm_input_state(skb)); dev = dev_get_by_index_rcu(net, ifindex); } else { dev = skb->dev; } if (!dev || !(dev->flags & IFF_UP)) return false; if (dev->netdev_ops != &xfrmi_netdev_ops) return false; xi = netdev_priv(dev); res->net = xi->net; if (xi->p.collect_md) res->if_id = xfrm_input_state(skb)->if_id; else res->if_id = xi->p.if_id; return true; } static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); rcu_assign_pointer(*xip, xi); } static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip; struct xfrm_if *iter; for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; (iter = rtnl_dereference(*xip)) != NULL; xip = &iter->next) { if (xi == iter) { rcu_assign_pointer(*xip, xi->next); break; } } } static void xfrmi_dev_free(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); gro_cells_destroy(&xi->gro_cells); free_percpu(dev->tstats); } static int xfrmi_create(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = dev_net(dev); struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; dev->rtnl_link_ops = &xfrmi_link_ops; err = register_netdevice(dev); if (err < 0) goto out; if (xi->p.collect_md) rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); else xfrmi_link(xfrmn, xi); return 0; out: return err; } static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) { struct xfrm_if __rcu **xip; struct xfrm_if *xi; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) if (xi->p.if_id == p->if_id) return xi; return NULL; } static void xfrmi_dev_uninit(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); if (xi->p.collect_md) RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); else xfrmi_unlink(xfrmn, xi); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) { skb_clear_tstamp(skb); skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); nf_reset_ct(skb); nf_reset_trace(skb); if (!xnet) return; ipvs_reset(skb); secpath_reset(skb); skb_orphan(skb); skb->mark = 0; } static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type, unsigned short family) { struct sec_path *sp; sp = skb_sec_path(skb); if (sp && (sp->len || sp->olen) && !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) goto discard; XFRM_SPI_SKB_CB(skb)->family = family; if (family == AF_INET) { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; } else { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; } return xfrm_input(skb, nexthdr, spi, encap_type); discard: kfree_skb(skb); return 0; } static int xfrmi4_rcv(struct sk_buff *skb) { return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); } static int xfrmi6_rcv(struct sk_buff *skb) { return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], 0, 0, AF_INET6); } static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); } static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); } static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; struct net_device *dev; struct xfrm_state *x; struct xfrm_if *xi; bool xnet; int link; if (err && !secpath_exists(skb)) return 0; x = xfrm_input_state(skb); xi = xfrmi_lookup(xs_net(x), x); if (!xi) return 1; link = skb->dev->ifindex; dev = xi->dev; skb->dev = dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } xnet = !net_eq(xi->net, dev_net(skb->dev)); if (xnet) { inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, inner_mode->family)) return -EPERM; } xfrmi_scrub_packet(skb, xnet); if (xi->p.collect_md) { struct metadata_dst *md_dst; md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); if (!md_dst) return -ENOMEM; md_dst->u.xfrm_info.if_id = x->if_id; md_dst->u.xfrm_info.link = link; skb_dst_set(skb, (struct dst_entry *)md_dst); } dev_sw_netstats_rx_add(dev, skb->len); return 0; } static int xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); unsigned int length = skb->len; struct net_device *tdev; struct xfrm_state *x; int err = -1; u32 if_id; int mtu; if (xi->p.collect_md) { struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); if (unlikely(!md_info)) return -EINVAL; if_id = md_info->if_id; fl->flowi_oif = md_info->link; if (md_info->dst_orig) { struct dst_entry *tmp_dst = dst; dst = md_info->dst_orig; skb_dst_set(skb, dst); md_info->dst_orig = NULL; dst_release(tmp_dst); } } else { if_id = xi->p.if_id; } dst_hold(dst); dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } x = dst->xfrm; if (!x) goto tx_err_link_failure; if (x->if_id != if_id) goto tx_err_link_failure; tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", dev->name); goto tx_err_dst_release; } mtu = dst_mtu(dst); if ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (skb->len > 1280) icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); else goto xmit; } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } dst_release(dst); return -EMSGSIZE; } xmit: xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = tdev; err = dst_output(xi->net, skb->sk, skb); if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); if (!dst) { fl.u.ip6.flowi6_oif = dev->ifindex; fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); if (dst->error) { dst_release(dst); DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, dst); } break; case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); if (!dst) { struct rtable *rt; fl.u.ip4.flowi4_oif = dev->ifindex; fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, &rt->dst); } break; default: goto tx_err; } fl.flowi_oif = xi->p.link; ret = xfrmi_xmit2(skb, dev, &fl); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static int xfrmi4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->protocol; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah ; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, protocol); else ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; } static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->nexthdr; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data + offset); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data + offset); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data + offset); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) return 0; x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET6); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; } static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) { if (xi->p.link != p->link) return -EINVAL; xi->p.if_id = p->if_id; return 0; } static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) { struct net *net = xi->net; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; xfrmi_unlink(xfrmn, xi); synchronize_net(); err = xfrmi_change(xi, p); xfrmi_link(xfrmn, xi); netdev_state_change(xi->dev); return err; } static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return xi->p.link; } static const struct net_device_ops xfrmi_netdev_ops = { .ndo_init = xfrmi_dev_init, .ndo_uninit = xfrmi_dev_uninit, .ndo_start_xmit = xfrmi_xmit, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = xfrmi_get_iflink, }; static void xfrmi_dev_setup(struct net_device *dev) { dev->netdev_ops = &xfrmi_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_NONE; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP_MAX_MTU; dev->flags = IFF_NOARP; dev->needs_free_netdev = true; dev->priv_destructor = xfrmi_dev_free; netif_keep_dst(dev); eth_broadcast_addr(dev->broadcast); } #define XFRMI_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static int xfrmi_dev_init(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); int err; dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; err = gro_cells_init(&xi->gro_cells, dev); if (err) { free_percpu(dev->tstats); return err; } dev->features |= NETIF_F_LLTX; dev->features |= XFRMI_FEATURES; dev->hw_features |= XFRMI_FEATURES; if (phydev) { dev->needed_headroom = phydev->needed_headroom; dev->needed_tailroom = phydev->needed_tailroom; if (is_zero_ether_addr(dev->dev_addr)) eth_hw_addr_inherit(dev, phydev); if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, phydev->broadcast, dev->addr_len); } else { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); } return 0; } static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void xfrmi_netlink_parms(struct nlattr *data[], struct xfrm_if_parms *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_XFRM_LINK]) parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); if (data[IFLA_XFRM_IF_ID]) parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); if (data[IFLA_XFRM_COLLECT_METADATA]) parms->collect_md = true; } static int xfrmi_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); struct xfrm_if_parms p = {}; struct xfrm_if *xi; int err; xfrmi_netlink_parms(data, &p); if (p.collect_md) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); if (p.link || p.if_id) { NL_SET_ERR_MSG(extack, "link and if_id must be zero"); return -EINVAL; } if (rtnl_dereference(xfrmn->collect_md_xfrmi)) return -EEXIST; } else { if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (xi) return -EEXIST; } xi = netdev_priv(dev); xi->p = p; xi->net = net; xi->dev = dev; err = xfrmi_create(dev); return err; } static void xfrmi_dellink(struct net_device *dev, struct list_head *head) { unregister_netdevice_queue(dev, head); } static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = xi->net; struct xfrm_if_parms p = {}; xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } if (p.collect_md) { NL_SET_ERR_MSG(extack, "collect_md can't be changed"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); } else { if (xi->dev != dev) return -EEXIST; if (xi->p.collect_md) { NL_SET_ERR_MSG(extack, "device can't be changed to collect_md"); return -EINVAL; } } return xfrmi_update(xi, &p); } static size_t xfrmi_get_size(const struct net_device *dev) { return /* IFLA_XFRM_LINK */ nla_total_size(4) + /* IFLA_XFRM_IF_ID */ nla_total_size(4) + /* IFLA_XFRM_COLLECT_METADATA */ nla_total_size(0) + 0; } static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrm_if_parms *parm = &xi->p; if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct net *xfrmi_get_link_net(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return xi->net; } static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, [IFLA_XFRM_LINK] = { .type = NLA_U32 }, [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .kind = "xfrm", .maxtype = IFLA_XFRM_MAX, .policy = xfrmi_policy, .priv_size = sizeof(struct xfrm_if), .setup = xfrmi_dev_setup, .validate = xfrmi_validate, .newlink = xfrmi_newlink, .dellink = xfrmi_dellink, .changelink = xfrmi_changelink, .get_size = xfrmi_get_size, .fill_info = xfrmi_fill_info, .get_link_net = xfrmi_get_link_net, }; static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) { struct net *net; LIST_HEAD(list); rtnl_lock(); list_for_each_entry(net, net_exit_list, exit_list) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if __rcu **xip; struct xfrm_if *xi; int i; for (i = 0; i < XFRMI_HASH_SIZE; i++) { for (xip = &xfrmn->xfrmi[i]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) unregister_netdevice_queue(xi->dev, &list); } xi = rtnl_dereference(xfrmn->collect_md_xfrmi); if (xi) unregister_netdevice_queue(xi->dev, &list); } unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations xfrmi_net_ops = { .exit_batch = xfrmi_exit_batch_net, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { .handler = xfrmi6_rcv, .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) static int xfrmi6_rcv_tunnel(struct sk_buff *skb) { const xfrm_address_t *saddr; __be32 spi; saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { .handler = xfrmi4_rcv, .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) static int xfrmi4_rcv_tunnel(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); } static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 3, }; static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 2, }; #endif static int __init xfrmi4_init(void) { int err; err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ipip_failed; err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipip6_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm_tunnel_ipip6_failed: xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi4_fini(void) { #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); #endif xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); } static int __init xfrmi6_init(void) { int err; err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipv6_failed; err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ip6ip_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm_tunnel_ip6ip_failed: xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); xfrm_tunnel_ipv6_failed: xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi6_fini(void) { #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); #endif xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); } static const struct xfrm_if_cb xfrm_if_cb = { .decode_session = xfrmi_decode_session, }; static int __init xfrmi_init(void) { const char *msg; int err; pr_info("IPsec XFRM device driver\n"); msg = "tunnel device"; err = register_pernet_device(&xfrmi_net_ops); if (err < 0) goto pernet_dev_failed; msg = "xfrm4 protocols"; err = xfrmi4_init(); if (err < 0) goto xfrmi4_failed; msg = "xfrm6 protocols"; err = xfrmi6_init(); if (err < 0) goto xfrmi6_failed; msg = "netlink interface"; err = rtnl_link_register(&xfrmi_link_ops); if (err < 0) goto rtnl_link_failed; err = register_xfrm_interface_bpf(); if (err < 0) goto kfunc_failed; lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); xfrm_if_register_cb(&xfrm_if_cb); return err; kfunc_failed: rtnl_link_unregister(&xfrmi_link_ops); rtnl_link_failed: xfrmi6_fini(); xfrmi6_failed: xfrmi4_fini(); xfrmi4_failed: unregister_pernet_device(&xfrmi_net_ops); pernet_dev_failed: pr_err("xfrmi init: failed to register %s\n", msg); return err; } static void __exit xfrmi_fini(void) { xfrm_if_unregister_cb(); lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); rtnl_link_unregister(&xfrmi_link_ops); xfrmi4_fini(); xfrmi6_fini(); unregister_pernet_device(&xfrmi_net_ops); } module_init(xfrmi_init); module_exit(xfrmi_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("xfrm"); MODULE_ALIAS_NETDEV("xfrm0"); MODULE_AUTHOR("Steffen Klassert"); MODULE_DESCRIPTION("XFRM virtual interface");
6 2 2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* * net/tipc/addr.c: TIPC address utility routines * * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "addr.h" #include "core.h" bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) { if (!domain || (domain == addr)) return true; if (!legacy_format) return false; if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_CLUSTER_MASK)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_MASK)) /* domain <Z.0.0> */ return true; return false; } void tipc_set_node_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); memcpy(tn->node_id, id, NODE_ID_LEN); tipc_nodeid2string(tn->node_id_string, id); tn->trial_addr = hash128to32(id); pr_info("Node identity %s, cluster identity %u\n", tipc_own_id_string(net), tn->net_id); } void tipc_set_node_addr(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); u8 node_id[NODE_ID_LEN] = {0,}; tn->node_addr = addr; if (!tipc_own_id(net)) { sprintf(node_id, "%x", addr); tipc_set_node_id(net, node_id); } tn->trial_addr = addr; tn->addr_trial_end = jiffies; pr_info("Node number set to %u\n", addr); } char *tipc_nodeid2string(char *str, u8 *id) { int i; u8 c; /* Already a string ? */ for (i = 0; i < NODE_ID_LEN; i++) { c = id[i]; if (c >= '0' && c <= '9') continue; if (c >= 'A' && c <= 'Z') continue; if (c >= 'a' && c <= 'z') continue; if (c == '.') continue; if (c == ':') continue; if (c == '_') continue; if (c == '-') continue; if (c == '@') continue; if (c != 0) break; } if (i == NODE_ID_LEN) { memcpy(str, id, NODE_ID_LEN); str[NODE_ID_LEN] = 0; return str; } /* Translate to hex string */ for (i = 0; i < NODE_ID_LEN; i++) sprintf(&str[2 * i], "%02x", id[i]); /* Strip off trailing zeroes */ for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--) str[i] = 0; return str; }
2423 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor network mediation definitions. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2017 Canonical Ltd. */ #ifndef __AA_NET_H #define __AA_NET_H #include <net/sock.h> #include <linux/path.h> #include "apparmorfs.h" #include "label.h" #include "perms.h" #include "policy.h" #define AA_MAY_SEND AA_MAY_WRITE #define AA_MAY_RECEIVE AA_MAY_READ #define AA_MAY_SHUTDOWN AA_MAY_DELETE #define AA_MAY_CONNECT AA_MAY_OPEN #define AA_MAY_ACCEPT 0x00100000 #define AA_MAY_BIND 0x00200000 #define AA_MAY_LISTEN 0x00400000 #define AA_MAY_SETOPT 0x01000000 #define AA_MAY_GETOPT 0x02000000 #define NET_PERMS_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE | \ AA_MAY_SHUTDOWN | AA_MAY_BIND | AA_MAY_LISTEN | \ AA_MAY_CONNECT | AA_MAY_ACCEPT | AA_MAY_SETATTR | \ AA_MAY_GETATTR | AA_MAY_SETOPT | AA_MAY_GETOPT) #define NET_FS_PERMS (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CREATE | \ AA_MAY_SHUTDOWN | AA_MAY_CONNECT | AA_MAY_RENAME |\ AA_MAY_SETATTR | AA_MAY_GETATTR | AA_MAY_CHMOD | \ AA_MAY_CHOWN | AA_MAY_CHGRP | AA_MAY_LOCK | \ AA_MAY_MPROT) #define NET_PEER_MASK (AA_MAY_SEND | AA_MAY_RECEIVE | AA_MAY_CONNECT | \ AA_MAY_ACCEPT) struct aa_sk_ctx { struct aa_label *label; struct aa_label *peer; }; #define SK_CTX(X) ((X)->sk_security) static inline struct aa_sk_ctx *aa_sock(const struct sock *sk) { return sk->sk_security; } #define DEFINE_AUDIT_NET(NAME, OP, SK, F, T, P) \ struct lsm_network_audit NAME ## _net = { .sk = (SK), \ .family = (F)}; \ DEFINE_AUDIT_DATA(NAME, \ ((SK) && (F) != AF_UNIX) ? LSM_AUDIT_DATA_NET : \ LSM_AUDIT_DATA_NONE, \ AA_CLASS_NET, \ OP); \ NAME.common.u.net = &(NAME ## _net); \ NAME.net.type = (T); \ NAME.net.protocol = (P) #define DEFINE_AUDIT_SK(NAME, OP, SK) \ DEFINE_AUDIT_NET(NAME, OP, SK, (SK)->sk_family, (SK)->sk_type, \ (SK)->sk_protocol) #define af_select(FAMILY, FN, DEF_FN) \ ({ \ int __e; \ switch ((FAMILY)) { \ default: \ __e = DEF_FN; \ } \ __e; \ }) struct aa_secmark { u8 audit; u8 deny; u32 secid; char *label; }; extern struct aa_sfs_entry aa_sfs_entry_network[]; void audit_net_cb(struct audit_buffer *ab, void *va); int aa_profile_af_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, u32 request, u16 family, int type); int aa_af_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, u16 family, int type, int protocol); static inline int aa_profile_af_sk_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, u32 request, struct sock *sk) { return aa_profile_af_perm(profile, ad, request, sk->sk_family, sk->sk_type); } int aa_sk_perm(const char *op, u32 request, struct sock *sk); int aa_sock_file_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, struct socket *sock); int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, u32 secid, const struct sock *sk); #endif /* __AA_NET_H */
64 47 63 988 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 // SPDX-License-Identifier: GPL-2.0-or-later /* * tsacct.c - System accounting over taskstats interface * * Copyright (C) Jay Lan, <jlan@sgi.com> */ #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/sched/cputime.h> #include <linux/tsacct_kern.h> #include <linux/acct.h> #include <linux/jiffies.h> #include <linux/mm.h> /* * fill in basic accounting fields */ void bacct_add_tsk(struct user_namespace *user_ns, struct pid_namespace *pid_ns, struct taskstats *stats, struct task_struct *tsk) { const struct cred *tcred; u64 utime, stime, utimescaled, stimescaled; u64 now_ns, delta; time64_t btime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); /* calculate task elapsed time in nsec */ now_ns = ktime_get_ns(); /* store whole group time first */ delta = now_ns - tsk->group_leader->start_time; /* Convert to micro seconds */ do_div(delta, NSEC_PER_USEC); stats->ac_tgetime = delta; delta = now_ns - tsk->start_time; do_div(delta, NSEC_PER_USEC); stats->ac_etime = delta; /* Convert to seconds for btime (note y2106 limit) */ btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC); stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX); stats->ac_btime64 = btime; if (tsk->flags & PF_EXITING) stats->ac_exitcode = tsk->exit_code; if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC)) stats->ac_flag |= AFORK; if (tsk->flags & PF_SUPERPRIV) stats->ac_flag |= ASU; if (tsk->flags & PF_DUMPCORE) stats->ac_flag |= ACORE; if (tsk->flags & PF_SIGNALED) stats->ac_flag |= AXSIG; stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); stats->ac_ppid = pid_alive(tsk) ? task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); task_cputime(tsk, &utime, &stime); stats->ac_utime = div_u64(utime, NSEC_PER_USEC); stats->ac_stime = div_u64(stime, NSEC_PER_USEC); task_cputime_scaled(tsk, &utimescaled, &stimescaled); stats->ac_utimescaled = div_u64(utimescaled, NSEC_PER_USEC); stats->ac_stimescaled = div_u64(stimescaled, NSEC_PER_USEC); stats->ac_minflt = tsk->min_flt; stats->ac_majflt = tsk->maj_flt; strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm)); } #ifdef CONFIG_TASK_XACCT #define KB 1024 #define MB (1024*KB) #define KB_MASK (~(KB-1)) /* * fill in extended accounting fields */ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; do_div(stats->coremem, 1000 * KB); stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; do_div(stats->virtmem, 1000 * KB); mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB; stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; mmput(mm); } stats->read_char = p->ioac.rchar & KB_MASK; stats->write_char = p->ioac.wchar & KB_MASK; stats->read_syscalls = p->ioac.syscr & KB_MASK; stats->write_syscalls = p->ioac.syscw & KB_MASK; #ifdef CONFIG_TASK_IO_ACCOUNTING stats->read_bytes = p->ioac.read_bytes & KB_MASK; stats->write_bytes = p->ioac.write_bytes & KB_MASK; stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK; #else stats->read_bytes = 0; stats->write_bytes = 0; stats->cancelled_write_bytes = 0; #endif } #undef KB #undef MB static void __acct_update_integrals(struct task_struct *tsk, u64 utime, u64 stime) { u64 time, delta; if (!likely(tsk->mm)) return; time = stime + utime; delta = time - tsk->acct_timexpd; if (delta < TICK_NSEC) return; tsk->acct_timexpd = time; /* * Divide by 1024 to avoid overflow, and to avoid division. * The final unit reported to userspace is Mbyte-usecs, * the rest of the math is done in xacct_add_tsk. */ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; tsk->acct_vm_mem1 += delta * READ_ONCE(tsk->mm->total_vm) >> 10; } /** * acct_update_integrals - update mm integral fields in task_struct * @tsk: task_struct for accounting */ void acct_update_integrals(struct task_struct *tsk) { u64 utime, stime; unsigned long flags; local_irq_save(flags); task_cputime(tsk, &utime, &stime); __acct_update_integrals(tsk, utime, stime); local_irq_restore(flags); } /** * acct_account_cputime - update mm integral after cputime update * @tsk: task_struct for accounting */ void acct_account_cputime(struct task_struct *tsk) { __acct_update_integrals(tsk, tsk->utime, tsk->stime); } /** * acct_clear_integrals - clear the mm integral fields in task_struct * @tsk: task_struct whose accounting fields are cleared */ void acct_clear_integrals(struct task_struct *tsk) { tsk->acct_timexpd = 0; tsk->acct_rss_mem1 = 0; tsk->acct_vm_mem1 = 0; } #endif
7 7 7 7 7 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 // SPDX-License-Identifier: GPL-2.0-or-later /* * Point-to-Point Tunneling Protocol for Linux * * Authors: Dmitry Kozlov <xeb@mail.ru> */ #include <linux/string.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/ppp_channel.h> #include <linux/ppp_defs.h> #include <linux/if_pppox.h> #include <linux/ppp-ioctl.h> #include <linux/notifier.h> #include <linux/file.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/rcupdate.h> #include <linux/security.h> #include <linux/spinlock.h> #include <net/sock.h> #include <net/protocol.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/gre.h> #include <net/pptp.h> #include <linux/uaccess.h> #define PPTP_DRIVER_VERSION "0.8.5" #define MAX_CALLID 65535 static DECLARE_BITMAP(callid_bitmap, MAX_CALLID + 1); static struct pppox_sock __rcu **callid_sock; static DEFINE_SPINLOCK(chan_lock); static struct proto pptp_sk_proto __read_mostly; static const struct ppp_channel_ops pptp_chan_ops; static const struct proto_ops pptp_ops; static struct pppox_sock *lookup_chan(u16 call_id, __be32 s_addr) { struct pppox_sock *sock; struct pptp_opt *opt; rcu_read_lock(); sock = rcu_dereference(callid_sock[call_id]); if (sock) { opt = &sock->proto.pptp; if (opt->dst_addr.sin_addr.s_addr != s_addr) sock = NULL; else sock_hold(sk_pppox(sock)); } rcu_read_unlock(); return sock; } static int lookup_chan_dst(u16 call_id, __be32 d_addr) { struct pppox_sock *sock; struct pptp_opt *opt; int i; rcu_read_lock(); i = 1; for_each_set_bit_from(i, callid_bitmap, MAX_CALLID) { sock = rcu_dereference(callid_sock[i]); if (!sock) continue; opt = &sock->proto.pptp; if (opt->dst_addr.call_id == call_id && opt->dst_addr.sin_addr.s_addr == d_addr) break; } rcu_read_unlock(); return i < MAX_CALLID; } static int add_chan(struct pppox_sock *sock, struct pptp_addr *sa) { static int call_id; spin_lock(&chan_lock); if (!sa->call_id) { call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, call_id + 1); if (call_id == MAX_CALLID) { call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, 1); if (call_id == MAX_CALLID) goto out_err; } sa->call_id = call_id; } else if (test_bit(sa->call_id, callid_bitmap)) { goto out_err; } sock->proto.pptp.src_addr = *sa; set_bit(sa->call_id, callid_bitmap); rcu_assign_pointer(callid_sock[sa->call_id], sock); spin_unlock(&chan_lock); return 0; out_err: spin_unlock(&chan_lock); return -1; } static void del_chan(struct pppox_sock *sock) { spin_lock(&chan_lock); clear_bit(sock->proto.pptp.src_addr.call_id, callid_bitmap); RCU_INIT_POINTER(callid_sock[sock->proto.pptp.src_addr.call_id], NULL); spin_unlock(&chan_lock); } static struct rtable *pptp_route_output(const struct pppox_sock *po, struct flowi4 *fl4) { const struct sock *sk = &po->sk; struct net *net; net = sock_net(sk); flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 0, RT_SCOPE_UNIVERSE, IPPROTO_GRE, 0, po->proto.pptp.dst_addr.sin_addr.s_addr, po->proto.pptp.src_addr.sin_addr.s_addr, 0, 0, sock_net_uid(net, sk)); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { struct sock *sk = chan->private; struct pppox_sock *po = pppox_sk(sk); struct net *net = sock_net(sk); struct pptp_opt *opt = &po->proto.pptp; struct pptp_gre_header *hdr; unsigned int header_len = sizeof(*hdr); struct flowi4 fl4; int islcp; int len; unsigned char *data; __u32 seq_recv; struct rtable *rt; struct net_device *tdev; struct iphdr *iph; int max_headroom; if (sk_pppox(po)->sk_state & PPPOX_DEAD) goto tx_error; rt = pptp_route_output(po, &fl4); if (IS_ERR(rt)) goto tx_error; tdev = rt->dst.dev; max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(*iph) + sizeof(*hdr) + 2; if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); goto tx_error; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } data = skb->data; islcp = ((data[0] << 8) + data[1]) == PPP_LCP && 1 <= data[2] && data[2] <= 7; /* compress protocol field */ if ((opt->ppp_flags & SC_COMP_PROT) && data[0] == 0 && !islcp) skb_pull(skb, 1); /* Put in the address/control bytes if necessary */ if ((opt->ppp_flags & SC_COMP_AC) == 0 || islcp) { data = skb_push(skb, 2); data[0] = PPP_ALLSTATIONS; data[1] = PPP_UI; } len = skb->len; seq_recv = opt->seq_recv; if (opt->ack_sent == seq_recv) header_len -= sizeof(hdr->ack); /* Push down and install GRE header */ skb_push(skb, header_len); hdr = (struct pptp_gre_header *)(skb->data); hdr->gre_hd.flags = GRE_KEY | GRE_VERSION_1 | GRE_SEQ; hdr->gre_hd.protocol = GRE_PROTO_PPP; hdr->call_id = htons(opt->dst_addr.call_id); hdr->seq = htonl(++opt->seq_sent); if (opt->ack_sent != seq_recv) { /* send ack with this message */ hdr->gre_hd.flags |= GRE_ACK; hdr->ack = htonl(seq_recv); opt->ack_sent = seq_recv; } hdr->payload_len = htons(len); /* Push down and install the IP header. */ skb_reset_transport_header(skb); skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; if (ip_dont_fragment(sk, &rt->dst)) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; iph->protocol = IPPROTO_GRE; iph->tos = 0; iph->daddr = fl4.daddr; iph->saddr = fl4.saddr; iph->ttl = ip4_dst_hoplimit(&rt->dst); iph->tot_len = htons(skb->len); skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); nf_reset_ct(skb); skb->ip_summed = CHECKSUM_NONE; ip_select_ident(net, skb, NULL); ip_send_check(iph); ip_local_out(net, skb->sk, skb); return 1; tx_error: kfree_skb(skb); return 1; } static int pptp_rcv_core(struct sock *sk, struct sk_buff *skb) { struct pppox_sock *po = pppox_sk(sk); struct pptp_opt *opt = &po->proto.pptp; int headersize, payload_len, seq; __u8 *payload; struct pptp_gre_header *header; if (!(sk->sk_state & PPPOX_CONNECTED)) { if (sock_queue_rcv_skb(sk, skb)) goto drop; return NET_RX_SUCCESS; } header = (struct pptp_gre_header *)(skb->data); headersize = sizeof(*header); /* test if acknowledgement present */ if (GRE_IS_ACK(header->gre_hd.flags)) { __u32 ack; if (!pskb_may_pull(skb, headersize)) goto drop; header = (struct pptp_gre_header *)(skb->data); /* ack in different place if S = 0 */ ack = GRE_IS_SEQ(header->gre_hd.flags) ? ntohl(header->ack) : ntohl(header->seq); if (ack > opt->ack_recv) opt->ack_recv = ack; /* also handle sequence number wrap-around */ if (WRAPPED(ack, opt->ack_recv)) opt->ack_recv = ack; } else { headersize -= sizeof(header->ack); } /* test if payload present */ if (!GRE_IS_SEQ(header->gre_hd.flags)) goto drop; payload_len = ntohs(header->payload_len); seq = ntohl(header->seq); /* check for incomplete packet (length smaller than expected) */ if (!pskb_may_pull(skb, headersize + payload_len)) goto drop; payload = skb->data + headersize; /* check for expected sequence number */ if (seq < opt->seq_recv + 1 || WRAPPED(opt->seq_recv, seq)) { if ((payload[0] == PPP_ALLSTATIONS) && (payload[1] == PPP_UI) && (PPP_PROTOCOL(payload) == PPP_LCP) && ((payload[4] == PPP_LCP_ECHOREQ) || (payload[4] == PPP_LCP_ECHOREP))) goto allow_packet; } else { opt->seq_recv = seq; allow_packet: skb_pull(skb, headersize); if (payload[0] == PPP_ALLSTATIONS && payload[1] == PPP_UI) { /* chop off address/control */ if (skb->len < 3) goto drop; skb_pull(skb, 2); } skb->ip_summed = CHECKSUM_NONE; skb_set_network_header(skb, skb->head-skb->data); ppp_input(&po->chan, skb); return NET_RX_SUCCESS; } drop: kfree_skb(skb); return NET_RX_DROP; } static int pptp_rcv(struct sk_buff *skb) { struct pppox_sock *po; struct pptp_gre_header *header; struct iphdr *iph; if (skb->pkt_type != PACKET_HOST) goto drop; if (!pskb_may_pull(skb, 12)) goto drop; iph = ip_hdr(skb); header = (struct pptp_gre_header *)skb->data; if (header->gre_hd.protocol != GRE_PROTO_PPP || /* PPTP-GRE protocol for PPTP */ GRE_IS_CSUM(header->gre_hd.flags) || /* flag CSUM should be clear */ GRE_IS_ROUTING(header->gre_hd.flags) || /* flag ROUTING should be clear */ !GRE_IS_KEY(header->gre_hd.flags) || /* flag KEY should be set */ (header->gre_hd.flags & GRE_FLAGS)) /* flag Recursion Ctrl should be clear */ /* if invalid, discard this packet */ goto drop; po = lookup_chan(ntohs(header->call_id), iph->saddr); if (po) { skb_dst_drop(skb); nf_reset_ct(skb); return sk_receive_skb(sk_pppox(po), skb, 0); } drop: kfree_skb(skb); return NET_RX_DROP; } static int pptp_bind(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len) { struct sock *sk = sock->sk; struct sockaddr_pppox *sp = (struct sockaddr_pppox *) uservaddr; struct pppox_sock *po = pppox_sk(sk); int error = 0; if (sockaddr_len < sizeof(struct sockaddr_pppox)) return -EINVAL; lock_sock(sk); if (sk->sk_state & PPPOX_DEAD) { error = -EALREADY; goto out; } if (sk->sk_state & PPPOX_BOUND) { error = -EBUSY; goto out; } if (add_chan(po, &sp->sa_addr.pptp)) error = -EBUSY; else sk->sk_state |= PPPOX_BOUND; out: release_sock(sk); return error; } static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; struct sockaddr_pppox *sp = (struct sockaddr_pppox *) uservaddr; struct pppox_sock *po = pppox_sk(sk); struct pptp_opt *opt = &po->proto.pptp; struct rtable *rt; struct flowi4 fl4; int error = 0; if (sockaddr_len < sizeof(struct sockaddr_pppox)) return -EINVAL; if (sp->sa_protocol != PX_PROTO_PPTP) return -EINVAL; if (lookup_chan_dst(sp->sa_addr.pptp.call_id, sp->sa_addr.pptp.sin_addr.s_addr)) return -EALREADY; lock_sock(sk); /* Check for already bound sockets */ if (sk->sk_state & PPPOX_CONNECTED) { error = -EBUSY; goto end; } /* Check for already disconnected sockets, on attempts to disconnect */ if (sk->sk_state & PPPOX_DEAD) { error = -EALREADY; goto end; } if (!opt->src_addr.sin_addr.s_addr || !sp->sa_addr.pptp.sin_addr.s_addr) { error = -EINVAL; goto end; } po->chan.private = sk; po->chan.ops = &pptp_chan_ops; rt = pptp_route_output(po, &fl4); if (IS_ERR(rt)) { error = -EHOSTUNREACH; goto end; } sk_setup_caps(sk, &rt->dst); po->chan.mtu = dst_mtu(&rt->dst); if (!po->chan.mtu) po->chan.mtu = PPP_MRU; po->chan.mtu -= PPTP_HEADER_OVERHEAD; po->chan.hdrlen = 2 + sizeof(struct pptp_gre_header); error = ppp_register_channel(&po->chan); if (error) { pr_err("PPTP: failed to register PPP channel (%d)\n", error); goto end; } opt->dst_addr = sp->sa_addr.pptp; sk->sk_state |= PPPOX_CONNECTED; end: release_sock(sk); return error; } static int pptp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { int len = sizeof(struct sockaddr_pppox); struct sockaddr_pppox sp; memset(&sp.sa_addr, 0, sizeof(sp.sa_addr)); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_PPTP; sp.sa_addr.pptp = pppox_sk(sock->sk)->proto.pptp.src_addr; memcpy(uaddr, &sp, len); return len; } static int pptp_release(struct socket *sock) { struct sock *sk = sock->sk; struct pppox_sock *po; int error = 0; if (!sk) return 0; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) { release_sock(sk); return -EBADF; } po = pppox_sk(sk); del_chan(po); synchronize_rcu(); pppox_unbind_sock(sk); sk->sk_state = PPPOX_DEAD; sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); return error; } static void pptp_sock_destruct(struct sock *sk) { if (!(sk->sk_state & PPPOX_DEAD)) { del_chan(pppox_sk(sk)); pppox_unbind_sock(sk); } skb_queue_purge(&sk->sk_receive_queue); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); } static int pptp_create(struct net *net, struct socket *sock, int kern) { int error = -ENOMEM; struct sock *sk; struct pppox_sock *po; struct pptp_opt *opt; sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pptp_sk_proto, kern); if (!sk) goto out; sock_init_data(sock, sk); sock->state = SS_UNCONNECTED; sock->ops = &pptp_ops; sk->sk_backlog_rcv = pptp_rcv_core; sk->sk_state = PPPOX_NONE; sk->sk_type = SOCK_STREAM; sk->sk_family = PF_PPPOX; sk->sk_protocol = PX_PROTO_PPTP; sk->sk_destruct = pptp_sock_destruct; po = pppox_sk(sk); opt = &po->proto.pptp; opt->seq_sent = 0; opt->seq_recv = 0xffffffff; opt->ack_recv = 0; opt->ack_sent = 0xffffffff; error = 0; out: return error; } static int pptp_ppp_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg) { struct sock *sk = chan->private; struct pppox_sock *po = pppox_sk(sk); struct pptp_opt *opt = &po->proto.pptp; void __user *argp = (void __user *)arg; int __user *p = argp; int err, val; err = -EFAULT; switch (cmd) { case PPPIOCGFLAGS: val = opt->ppp_flags; if (put_user(val, p)) break; err = 0; break; case PPPIOCSFLAGS: if (get_user(val, p)) break; opt->ppp_flags = val & ~SC_RCV_BITS; err = 0; break; default: err = -ENOTTY; } return err; } static const struct ppp_channel_ops pptp_chan_ops = { .start_xmit = pptp_xmit, .ioctl = pptp_ppp_ioctl, }; static struct proto pptp_sk_proto __read_mostly = { .name = "PPTP", .owner = THIS_MODULE, .obj_size = sizeof(struct pppox_sock), }; static const struct proto_ops pptp_ops = { .family = AF_PPPOX, .owner = THIS_MODULE, .release = pptp_release, .bind = pptp_bind, .connect = pptp_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pptp_getname, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = pppox_compat_ioctl, #endif }; static const struct pppox_proto pppox_pptp_proto = { .create = pptp_create, .owner = THIS_MODULE, }; static const struct gre_protocol gre_pptp_protocol = { .handler = pptp_rcv, }; static int __init pptp_init_module(void) { int err = 0; pr_info("PPTP driver version " PPTP_DRIVER_VERSION "\n"); callid_sock = vzalloc(array_size(sizeof(void *), (MAX_CALLID + 1))); if (!callid_sock) return -ENOMEM; err = gre_add_protocol(&gre_pptp_protocol, GREPROTO_PPTP); if (err) { pr_err("PPTP: can't add gre protocol\n"); goto out_mem_free; } err = proto_register(&pptp_sk_proto, 0); if (err) { pr_err("PPTP: can't register sk_proto\n"); goto out_gre_del_protocol; } err = register_pppox_proto(PX_PROTO_PPTP, &pppox_pptp_proto); if (err) { pr_err("PPTP: can't register pppox_proto\n"); goto out_unregister_sk_proto; } return 0; out_unregister_sk_proto: proto_unregister(&pptp_sk_proto); out_gre_del_protocol: gre_del_protocol(&gre_pptp_protocol, GREPROTO_PPTP); out_mem_free: vfree(callid_sock); return err; } static void __exit pptp_exit_module(void) { unregister_pppox_proto(PX_PROTO_PPTP); proto_unregister(&pptp_sk_proto); gre_del_protocol(&gre_pptp_protocol, GREPROTO_PPTP); vfree(callid_sock); } module_init(pptp_init_module); module_exit(pptp_exit_module); MODULE_DESCRIPTION("Point-to-Point Tunneling Protocol"); MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_PPTP);
10 8 8 16 22 8 1 1 9 1 10 1 10 2 2 1 1 1 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2023 Isovalent */ #ifndef __BPF_MPROG_H #define __BPF_MPROG_H #include <linux/bpf.h> /* bpf_mprog framework: * * bpf_mprog is a generic layer for multi-program attachment. In-kernel users * of the bpf_mprog don't need to care about the dependency resolution * internals, they can just consume it with few API calls. Currently available * dependency directives are BPF_F_{BEFORE,AFTER} which enable insertion of * a BPF program or BPF link relative to an existing BPF program or BPF link * inside the multi-program array as well as prepend and append behavior if * no relative object was specified, see corresponding selftests for concrete * examples (e.g. tc_links and tc_opts test cases of test_progs). * * Usage of bpf_mprog_{attach,detach,query}() core APIs with pseudo code: * * Attach case: * * struct bpf_mprog_entry *entry, *entry_new; * int ret; * * // bpf_mprog user-side lock * // fetch active @entry from attach location * [...] * ret = bpf_mprog_attach(entry, &entry_new, [...]); * if (!ret) { * if (entry != entry_new) { * // swap @entry to @entry_new at attach location * // ensure there are no inflight users of @entry: * synchronize_rcu(); * } * bpf_mprog_commit(entry); * } else { * // error path, bail out, propagate @ret * } * // bpf_mprog user-side unlock * * Detach case: * * struct bpf_mprog_entry *entry, *entry_new; * int ret; * * // bpf_mprog user-side lock * // fetch active @entry from attach location * [...] * ret = bpf_mprog_detach(entry, &entry_new, [...]); * if (!ret) { * // all (*) marked is optional and depends on the use-case * // whether bpf_mprog_bundle should be freed or not * if (!bpf_mprog_total(entry_new)) (*) * entry_new = NULL (*) * // swap @entry to @entry_new at attach location * // ensure there are no inflight users of @entry: * synchronize_rcu(); * bpf_mprog_commit(entry); * if (!entry_new) (*) * // free bpf_mprog_bundle (*) * } else { * // error path, bail out, propagate @ret * } * // bpf_mprog user-side unlock * * Query case: * * struct bpf_mprog_entry *entry; * int ret; * * // bpf_mprog user-side lock * // fetch active @entry from attach location * [...] * ret = bpf_mprog_query(attr, uattr, entry); * // bpf_mprog user-side unlock * * Data/fast path: * * struct bpf_mprog_entry *entry; * struct bpf_mprog_fp *fp; * struct bpf_prog *prog; * int ret = [...]; * * rcu_read_lock(); * // fetch active @entry from attach location * [...] * bpf_mprog_foreach_prog(entry, fp, prog) { * ret = bpf_prog_run(prog, [...]); * // process @ret from program * } * [...] * rcu_read_unlock(); * * bpf_mprog locking considerations: * * bpf_mprog_{attach,detach,query}() must be protected by an external lock * (like RTNL in case of tcx). * * bpf_mprog_entry pointer can be an __rcu annotated pointer (in case of tcx * the netdevice has tcx_ingress and tcx_egress __rcu pointer) which gets * updated via rcu_assign_pointer() pointing to the active bpf_mprog_entry of * the bpf_mprog_bundle. * * Fast path accesses the active bpf_mprog_entry within RCU critical section * (in case of tcx it runs in NAPI which provides RCU protection there, * other users might need explicit rcu_read_lock()). The bpf_mprog_commit() * assumes that for the old bpf_mprog_entry there are no inflight users * anymore. * * The READ_ONCE()/WRITE_ONCE() pairing for bpf_mprog_fp's prog access is for * the replacement case where we don't swap the bpf_mprog_entry. */ #define bpf_mprog_foreach_tuple(entry, fp, cp, t) \ for (fp = &entry->fp_items[0], cp = &entry->parent->cp_items[0];\ ({ \ t.prog = READ_ONCE(fp->prog); \ t.link = cp->link; \ t.prog; \ }); \ fp++, cp++) #define bpf_mprog_foreach_prog(entry, fp, p) \ for (fp = &entry->fp_items[0]; \ (p = READ_ONCE(fp->prog)); \ fp++) #define BPF_MPROG_MAX 64 struct bpf_mprog_fp { struct bpf_prog *prog; }; struct bpf_mprog_cp { struct bpf_link *link; }; struct bpf_mprog_entry { struct bpf_mprog_fp fp_items[BPF_MPROG_MAX]; struct bpf_mprog_bundle *parent; }; struct bpf_mprog_bundle { struct bpf_mprog_entry a; struct bpf_mprog_entry b; struct bpf_mprog_cp cp_items[BPF_MPROG_MAX]; struct bpf_prog *ref; atomic64_t revision; u32 count; }; struct bpf_tuple { struct bpf_prog *prog; struct bpf_link *link; }; static inline struct bpf_mprog_entry * bpf_mprog_peer(const struct bpf_mprog_entry *entry) { if (entry == &entry->parent->a) return &entry->parent->b; else return &entry->parent->a; } static inline void bpf_mprog_bundle_init(struct bpf_mprog_bundle *bundle) { BUILD_BUG_ON(sizeof(bundle->a.fp_items[0]) > sizeof(u64)); BUILD_BUG_ON(ARRAY_SIZE(bundle->a.fp_items) != ARRAY_SIZE(bundle->cp_items)); memset(bundle, 0, sizeof(*bundle)); atomic64_set(&bundle->revision, 1); bundle->a.parent = bundle; bundle->b.parent = bundle; } static inline void bpf_mprog_inc(struct bpf_mprog_entry *entry) { entry->parent->count++; } static inline void bpf_mprog_dec(struct bpf_mprog_entry *entry) { entry->parent->count--; } static inline int bpf_mprog_max(void) { return ARRAY_SIZE(((struct bpf_mprog_entry *)NULL)->fp_items) - 1; } static inline int bpf_mprog_total(struct bpf_mprog_entry *entry) { int total = entry->parent->count; WARN_ON_ONCE(total > bpf_mprog_max()); return total; } static inline bool bpf_mprog_exists(struct bpf_mprog_entry *entry, struct bpf_prog *prog) { const struct bpf_mprog_fp *fp; const struct bpf_prog *tmp; bpf_mprog_foreach_prog(entry, fp, tmp) { if (tmp == prog) return true; } return false; } static inline void bpf_mprog_mark_for_release(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { WARN_ON_ONCE(entry->parent->ref); if (!tuple->link) entry->parent->ref = tuple->prog; } static inline void bpf_mprog_complete_release(struct bpf_mprog_entry *entry) { /* In the non-link case prog deletions can only drop the reference * to the prog after the bpf_mprog_entry got swapped and the * bpf_mprog ensured that there are no inflight users anymore. * * Paired with bpf_mprog_mark_for_release(). */ if (entry->parent->ref) { bpf_prog_put(entry->parent->ref); entry->parent->ref = NULL; } } static inline void bpf_mprog_revision_new(struct bpf_mprog_entry *entry) { atomic64_inc(&entry->parent->revision); } static inline void bpf_mprog_commit(struct bpf_mprog_entry *entry) { bpf_mprog_complete_release(entry); bpf_mprog_revision_new(entry); } static inline u64 bpf_mprog_revision(struct bpf_mprog_entry *entry) { return atomic64_read(&entry->parent->revision); } static inline void bpf_mprog_entry_copy(struct bpf_mprog_entry *dst, struct bpf_mprog_entry *src) { memcpy(dst->fp_items, src->fp_items, sizeof(src->fp_items)); } static inline void bpf_mprog_entry_clear(struct bpf_mprog_entry *dst) { memset(dst->fp_items, 0, sizeof(dst->fp_items)); } static inline void bpf_mprog_clear_all(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new) { struct bpf_mprog_entry *peer; peer = bpf_mprog_peer(entry); bpf_mprog_entry_clear(peer); peer->parent->count = 0; *entry_new = peer; } static inline void bpf_mprog_entry_grow(struct bpf_mprog_entry *entry, int idx) { int total = bpf_mprog_total(entry); memmove(entry->fp_items + idx + 1, entry->fp_items + idx, (total - idx) * sizeof(struct bpf_mprog_fp)); memmove(entry->parent->cp_items + idx + 1, entry->parent->cp_items + idx, (total - idx) * sizeof(struct bpf_mprog_cp)); } static inline void bpf_mprog_entry_shrink(struct bpf_mprog_entry *entry, int idx) { /* Total array size is needed in this case to enure the NULL * entry is copied at the end. */ int total = ARRAY_SIZE(entry->fp_items); memmove(entry->fp_items + idx, entry->fp_items + idx + 1, (total - idx - 1) * sizeof(struct bpf_mprog_fp)); memmove(entry->parent->cp_items + idx, entry->parent->cp_items + idx + 1, (total - idx - 1) * sizeof(struct bpf_mprog_cp)); } static inline void bpf_mprog_read(struct bpf_mprog_entry *entry, u32 idx, struct bpf_mprog_fp **fp, struct bpf_mprog_cp **cp) { *fp = &entry->fp_items[idx]; *cp = &entry->parent->cp_items[idx]; } static inline void bpf_mprog_write(struct bpf_mprog_fp *fp, struct bpf_mprog_cp *cp, struct bpf_tuple *tuple) { WRITE_ONCE(fp->prog, tuple->prog); cp->link = tuple->link; } int bpf_mprog_attach(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_prog *prog_new, struct bpf_link *link, struct bpf_prog *prog_old, u32 flags, u32 id_or_fd, u64 revision); int bpf_mprog_detach(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_prog *prog, struct bpf_link *link, u32 flags, u32 id_or_fd, u64 revision); int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, struct bpf_mprog_entry *entry); static inline bool bpf_mprog_supported(enum bpf_prog_type type) { switch (type) { case BPF_PROG_TYPE_SCHED_CLS: return true; default: return false; } } #endif /* __BPF_MPROG_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_LEGACY_H #define __X86_KERNEL_FPU_LEGACY_H #include <asm/fpu/types.h> extern unsigned int mxcsr_feature_mask; static inline void ldmxcsr(u32 mxcsr) { asm volatile("ldmxcsr %0" :: "m" (mxcsr)); } /* * Returns 0 on success or the trap number when the operation raises an * exception. */ #define user_insn(insn, output, input...) \ ({ \ int err; \ \ might_fault(); \ \ asm volatile(ASM_STAC "\n" \ "1: " #insn "\n" \ "2: " ASM_CLAC "\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err), output \ : "0"(0), input); \ err; \ }) #define kernel_insn_err(insn, output, input...) \ ({ \ int err; \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[err]) \ : [err] "=r" (err), output \ : "0"(0), input); \ err; \ }) #define kernel_insn(insn, output, input...) \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ : output : input) static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) { return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); else return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); } static inline void fxrstor(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int fxrstor_safe(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline void frstor(struct fregs_state *fx) { kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int frstor_safe(struct fregs_state *fx) { return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) { return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline void fxsave(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); else asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); } #endif
1415 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 /* SPDX-License-Identifier: GPL-2.0 */ /* * * Definitions for mount interface. This describes the in the kernel build * linkedlist with mounted filesystems. * * Author: Marco van Wieringen <mvw@planets.elm.net> * */ #ifndef _LINUX_MOUNT_H #define _LINUX_MOUNT_H #include <linux/types.h> #include <asm/barrier.h> struct super_block; struct dentry; struct user_namespace; struct mnt_idmap; struct file_system_type; struct fs_context; struct file; struct path; #define MNT_NOSUID 0x01 #define MNT_NODEV 0x02 #define MNT_NOEXEC 0x04 #define MNT_NOATIME 0x08 #define MNT_NODIRATIME 0x10 #define MNT_RELATIME 0x20 #define MNT_READONLY 0x40 /* does the user want this to be r/o? */ #define MNT_NOSYMFOLLOW 0x80 #define MNT_SHRINKABLE 0x100 #define MNT_WRITE_HOLD 0x200 #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ /* * MNT_SHARED_MASK is the set of flags that should be cleared when a * mount becomes shared. Currently, this is only the flag that says a * mount cannot be bind mounted, since this is how we create a mount * that shares events with another mount. If you add a new MNT_* * flag, consider how it interacts with shared mounts. */ #define MNT_SHARED_MASK (MNT_UNBINDABLE) #define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \ | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \ | MNT_READONLY | MNT_NOSYMFOLLOW) #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \ MNT_CURSOR) #define MNT_INTERNAL 0x4000 #define MNT_LOCK_ATIME 0x040000 #define MNT_LOCK_NOEXEC 0x080000 #define MNT_LOCK_NOSUID 0x100000 #define MNT_LOCK_NODEV 0x200000 #define MNT_LOCK_READONLY 0x400000 #define MNT_LOCKED 0x800000 #define MNT_DOOMED 0x1000000 #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 #define MNT_CURSOR 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ int mnt_flags; struct mnt_idmap *mnt_idmap; } __randomize_layout; static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) { /* Pairs with smp_store_release() in do_idmap_mount(). */ return smp_load_acquire(&mnt->mnt_idmap); } extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write_file(struct file *file); extern void mnt_drop_write(struct vfsmount *mnt); extern void mnt_drop_write_file(struct file *file); extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); extern void mnt_make_shortterm(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(const struct path *path); extern bool __mnt_is_readonly(struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); extern struct vfsmount *clone_private_mount(const struct path *path); int mnt_get_write_access(struct vfsmount *mnt); void mnt_put_write_access(struct vfsmount *mnt); extern struct vfsmount *fc_mount(struct fs_context *fc); extern struct vfsmount *vfs_create_mount(struct fs_context *fc); extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); extern struct vfsmount *vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, const char *name, void *data); extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); extern bool path_is_mountpoint(const struct path *path); extern bool our_mnt(struct vfsmount *mnt); extern struct vfsmount *kern_mount(struct file_system_type *); extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); extern long do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern struct vfsmount *collect_mounts(const struct path *); extern void drop_collected_mounts(struct vfsmount *); extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, struct vfsmount *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); #endif /* _LINUX_MOUNT_H */
1184 1181 987 987 1184 200 200 200 200 987 987 987 987 987 987 987 987 987 987 987 987 987 987 987 985 39 1069 1069 140 140 140 1869 1869 1867 304 304 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 /* * mm/rmap.c - physical to virtual reverse mappings * * Copyright 2001, Rik van Riel <riel@conectiva.com.br> * Released under the General Public License (GPL). * * Simple, low overhead reverse mapping scheme. * Please try to keep this thing as modular as possible. * * Provides methods for unmapping each kind of mapped page: * the anon methods track anonymous pages, and * the file methods track pages belonging to an inode. * * Original design by Rik van Riel <riel@conectiva.com.br> 2001 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 * Contributions by Hugh Dickins 2003, 2004 */ /* * Lock ordering in mm: * * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) * page->flags PG_locked (lock_page) * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) * vma_start_write * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in block_dirty_folio) * folio_lock_memcg move_lock (in block_dirty_folio) * i_pages lock (widely used) * lruvec->lru_lock (in folio_lruvec_lock_irq) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * i_pages lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock * * hugetlbfs PageHuge() take locks in this order: * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * vma_lock (hugetlb specific lock for pmd_sharing) * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing) * page->flags PG_locked (lock_page) */ #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/backing-dev.h> #include <linux/page_idle.h> #include <linux/memremap.h> #include <linux/userfaultfd_k.h> #include <linux/mm_inline.h> #include <asm/tlbflush.h> #define CREATE_TRACE_POINTS #include <trace/events/tlb.h> #include <trace/events/migrate.h> #include "internal.h" static struct kmem_cache *anon_vma_cachep; static struct kmem_cache *anon_vma_chain_cachep; static inline struct anon_vma *anon_vma_alloc(void) { struct anon_vma *anon_vma; anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); if (anon_vma) { atomic_set(&anon_vma->refcount, 1); anon_vma->num_children = 0; anon_vma->num_active_vmas = 0; anon_vma->parent = anon_vma; /* * Initialise the anon_vma root to point to itself. If called * from fork, the root will be reset to the parents anon_vma. */ anon_vma->root = anon_vma; } return anon_vma; } static inline void anon_vma_free(struct anon_vma *anon_vma) { VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* * Synchronize against folio_lock_anon_vma_read() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by * down_read_trylock() from folio_lock_anon_vma_read(). This orders: * * folio_lock_anon_vma_read() VS put_anon_vma() * down_read_trylock() atomic_dec_and_test() * LOCK MB * atomic_read() rwsem_is_locked() * * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ might_sleep(); if (rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock_write(anon_vma); anon_vma_unlock_write(anon_vma); } kmem_cache_free(anon_vma_cachep, anon_vma); } static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) { return kmem_cache_alloc(anon_vma_chain_cachep, gfp); } static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) { kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } static void anon_vma_chain_link(struct vm_area_struct *vma, struct anon_vma_chain *avc, struct anon_vma *anon_vma) { avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); } /** * __anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question * * This makes sure the memory mapping described by 'vma' has * an 'anon_vma' attached to it, so that we can associate the * anonymous pages mapped into it with that anon_vma. * * The common case will be that we already have one, which * is handled inline by anon_vma_prepare(). But if * not we either need to find an adjacent mapping that we * can re-use the anon_vma from (very common when the only * reason for splitting a vma has been mprotect()), or we * allocate a new one. * * Anon-vma allocations are very subtle, because we may have * optimistically looked up an anon_vma in folio_lock_anon_vma_read() * and that may actually touch the rwsem even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). * * As a result, we need to do proper anon_vma locking even * for the new allocation. At the same time, we do not want * to do any locking for the common case of already having * an anon_vma. * * This must be called with the mmap_lock held for reading. */ int __anon_vma_prepare(struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; might_sleep(); avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_enomem; anon_vma = find_mergeable_anon_vma(vma); allocated = NULL; if (!anon_vma) { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) goto out_enomem_free_avc; anon_vma->num_children++; /* self-parent link for new root */ allocated = anon_vma; } anon_vma_lock_write(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; anon_vma_chain_link(vma, avc, anon_vma); anon_vma->num_active_vmas++; allocated = NULL; avc = NULL; } spin_unlock(&mm->page_table_lock); anon_vma_unlock_write(anon_vma); if (unlikely(allocated)) put_anon_vma(allocated); if (unlikely(avc)) anon_vma_chain_free(avc); return 0; out_enomem_free_avc: anon_vma_chain_free(avc); out_enomem: return -ENOMEM; } /* * This is a useful helper function for locking the anon_vma root as * we traverse the vma->anon_vma_chain, looping over anon_vma's that * have the same vma. * * Such anon_vma's should have the same root, so you'd expect to see * just a single mutex_lock for the whole traversal. */ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) { struct anon_vma *new_root = anon_vma->root; if (new_root != root) { if (WARN_ON_ONCE(root)) up_write(&root->rwsem); root = new_root; down_write(&root->rwsem); } return root; } static inline void unlock_anon_vma_root(struct anon_vma *root) { if (root) up_write(&root->rwsem); } /* * Attach the anon_vmas from src to dst. * Returns 0 on success, -ENOMEM on failure. * * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(), * copy_vma() and anon_vma_fork(). The first four want an exact copy of src, * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before * call, we can identify this case by checking (!dst->anon_vma && * src->anon_vma). * * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find * and reuse existing anon_vma which has no vmas and only one child anon_vma. * This prevents degradation of anon_vma hierarchy to endless linear chain in * case of constantly forking task. On the other hand, an anon_vma with more * than one child isn't reused even if there was no alive vma, thus rmap * walker has a good chance of avoiding scanning the whole hierarchy when it * searches where page is mapped. */ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; struct anon_vma *root = NULL; list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!avc)) { unlock_anon_vma_root(root); root = NULL; avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto enomem_failure; } anon_vma = pavc->anon_vma; root = lock_anon_vma_root(root, anon_vma); anon_vma_chain_link(dst, avc, anon_vma); /* * Reuse existing anon_vma if it has no vma and only one * anon_vma child. * * Root anon_vma is never reused: * it has self-parent reference and at least one child. */ if (!dst->anon_vma && src->anon_vma && anon_vma->num_children < 2 && anon_vma->num_active_vmas == 0) dst->anon_vma = anon_vma; } if (dst->anon_vma) dst->anon_vma->num_active_vmas++; unlock_anon_vma_root(root); return 0; enomem_failure: /* * dst->anon_vma is dropped here otherwise its num_active_vmas can * be incorrectly decremented in unlink_anon_vmas(). * We can safely do this because callers of anon_vma_clone() don't care * about dst->anon_vma if anon_vma_clone() failed. */ dst->anon_vma = NULL; unlink_anon_vmas(dst); return -ENOMEM; } /* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. * Returns 0 on success, non-zero on failure. */ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { struct anon_vma_chain *avc; struct anon_vma *anon_vma; int error; /* Don't bother if the parent process has no anon_vma here. */ if (!pvma->anon_vma) return 0; /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ vma->anon_vma = NULL; /* * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ error = anon_vma_clone(vma, pvma); if (error) return error; /* An existing anon_vma has been reused, all done then. */ if (vma->anon_vma) return 0; /* Then add our own anon_vma. */ anon_vma = anon_vma_alloc(); if (!anon_vma) goto out_error; anon_vma->num_active_vmas++; avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_error_free_anon_vma; /* * The root anon_vma's rwsem is the lock actually used when we * lock any of the anon_vmas in this anon_vma tree. */ anon_vma->root = pvma->anon_vma->root; anon_vma->parent = pvma->anon_vma; /* * With refcounts, an anon_vma can stay around longer than the * process it belongs to. The root anon_vma needs to be pinned until * this anon_vma is freed, because the lock lives in the root. */ get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); anon_vma->parent->num_children++; anon_vma_unlock_write(anon_vma); return 0; out_error_free_anon_vma: put_anon_vma(anon_vma); out_error: unlink_anon_vmas(vma); return -ENOMEM; } void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; struct anon_vma *root = NULL; /* * Unlink each anon_vma chained to the VMA. This list is ordered * from newest to oldest, ensuring the root anon_vma gets freed last. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; root = lock_anon_vma_root(root, anon_vma); anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { anon_vma->parent->num_children--; continue; } list_del(&avc->same_vma); anon_vma_chain_free(avc); } if (vma->anon_vma) { vma->anon_vma->num_active_vmas--; /* * vma would still be needed after unlink, and anon_vma will be prepared * when handle fault. */ vma->anon_vma = NULL; } unlock_anon_vma_root(root); /* * Iterate the list once more, it now only contains empty and unlinked * anon_vmas, destroy them. Could not do before due to __put_anon_vma() * needing to write-acquire the anon_vma->root->rwsem. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; VM_WARN_ON(anon_vma->num_children); VM_WARN_ON(anon_vma->num_active_vmas); put_anon_vma(anon_vma); list_del(&avc->same_vma); anon_vma_chain_free(avc); } } static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT_CACHED; } void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, anon_vma_ctor); anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC|SLAB_ACCOUNT); } /* * Getting a lock on a stable anon_vma from a page off the LRU is tricky! * * Since there is no serialization what so ever against page_remove_rmap() * the best this function can do is return a refcount increased anon_vma * that might have been relevant to this page. * * The page might have been remapped to a different anon_vma or the anon_vma * returned may already be freed (and even reused). * * In case it was remapped to a different anon_vma, the new anon_vma will be a * child of the old anon_vma, and the anon_vma lifetime rules will therefore * ensure that any anon_vma obtained from the page will still be valid for as * long as we observe page_mapped() [ hence all those page_mapped() tests ]. * * All users of this function must be very careful when walking the anon_vma * chain and verify that the page in question is indeed mapped in it * [ something equivalent to page_mapped_in_vma() ]. * * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from * page_remove_rmap() that the anon_vma pointer from page->mapping is valid * if there is a mapcount, we can dereference the anon_vma after observing * those. */ struct anon_vma *folio_get_anon_vma(struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } /* * If this folio is still mapped, then its anon_vma cannot have been * freed. But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). */ if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; } out: rcu_read_unlock(); return anon_vma; } /* * Similar to folio_get_anon_vma() except it locks the anon_vma. * * Its a little more complex as it tries to keep the fast path to a single * atomic op -- the trylock. If we fail the trylock, we fall back to getting a * reference like with folio_get_anon_vma() and then block on the mutex * on !rwc->try_lock case. */ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; unsigned long anon_mapping; rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { /* * If the folio is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ if (!folio_mapped(folio)) { up_read(&root_anon_vma->rwsem); anon_vma = NULL; } goto out; } if (rwc && rwc->try_lock) { anon_vma = NULL; rwc->contended = true; goto out; } /* trylock failed, we got to sleep */ if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; } /* we pinned the anon_vma, its safe to sleep */ rcu_read_unlock(); anon_vma_lock_read(anon_vma); if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because * we'll deadlock on the anon_vma_lock_write() recursion. */ anon_vma_unlock_read(anon_vma); __put_anon_vma(anon_vma); anon_vma = NULL; } return anon_vma; out: rcu_read_unlock(); return anon_vma; } #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* * Flush TLB entries for recently unmapped pages from remote CPUs. It is * important if a PTE was dirty when it was unmapped that it's flushed * before any IO is initiated on the page to prevent lost writes. Similarly, * it must be flushed before freeing to prevent data leakage. */ void try_to_unmap_flush(void) { struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; if (!tlb_ubc->flush_required) return; arch_tlbbatch_flush(&tlb_ubc->arch); tlb_ubc->flush_required = false; tlb_ubc->writable = false; } /* Flush iff there are potentially writable TLB entries that can race with IO */ void try_to_unmap_flush_dirty(void) { struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; if (tlb_ubc->writable) try_to_unmap_flush(); } /* * Bits 0-14 of mm->tlb_flush_batched record pending generations. * Bits 16-30 of mm->tlb_flush_batched bit record flushed generations. */ #define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16 #define TLB_FLUSH_BATCH_PENDING_MASK \ ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1) #define TLB_FLUSH_BATCH_PENDING_LARGE \ (TLB_FLUSH_BATCH_PENDING_MASK / 2) static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, unsigned long uaddr) { struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; int batch; bool writable = pte_dirty(pteval); if (!pte_accessible(mm, pteval)) return; arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr); tlb_ubc->flush_required = true; /* * Ensure compiler does not re-order the setting of tlb_flush_batched * before the PTE is cleared. */ barrier(); batch = atomic_read(&mm->tlb_flush_batched); retry: if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) { /* * Prevent `pending' from catching up with `flushed' because of * overflow. Reset `pending' and `flushed' to be 1 and 0 if * `pending' becomes large. */ if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1)) goto retry; } else { atomic_inc(&mm->tlb_flush_batched); } /* * If the PTE was dirty then it's best to assume it's writable. The * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() * before the page is queued for IO. */ if (writable) tlb_ubc->writable = true; } /* * Returns true if the TLB flush should be deferred to the end of a batch of * unmap operations to reduce IPIs. */ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) { if (!(flags & TTU_BATCH_FLUSH)) return false; return arch_tlbbatch_should_defer(mm); } /* * Reclaim unmaps pages under the PTL but do not flush the TLB prior to * releasing the PTL if TLB flushes are batched. It's possible for a parallel * operation such as mprotect or munmap to race between reclaim unmapping * the page and flushing the page. If this race occurs, it potentially allows * access to data via a stale TLB entry. Tracking all mm's that have TLB * batching in flight would be expensive during reclaim so instead track * whether TLB batching occurred in the past and if so then do a flush here * if required. This will cost one additional flush per reclaim cycle paid * by the first operation at risk such as mprotect and mumap. * * This must be called under the PTL so that an access to tlb_flush_batched * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise * via the PTL. */ void flush_tlb_batched_pending(struct mm_struct *mm) { int batch = atomic_read(&mm->tlb_flush_batched); int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK; int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; if (pending != flushed) { arch_flush_tlb_batched_pending(mm); /* * If the new TLB flushing is pending during flushing, leave * mm->tlb_flush_batched as is, to avoid losing flushing. */ atomic_cmpxchg(&mm->tlb_flush_batched, batch, pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT)); } } #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, unsigned long uaddr) { } static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) { return false; } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ /* * At what user virtual address is page expected in vma? * Caller should check the page is actually part of the vma. */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { struct folio *folio = page_folio(page); if (folio_test_anon(folio)) { struct anon_vma *page__anon_vma = folio_anon_vma(folio); /* * Note: swapoff's unuse_vma() is more efficient with this * check, and needs it to match anon_vma when KSM is active. */ if (!vma->anon_vma || !page__anon_vma || vma->anon_vma->root != page__anon_vma->root) return -EFAULT; } else if (!vma->vm_file) { return -EFAULT; } else if (vma->vm_file->f_mapping != folio->mapping) { return -EFAULT; } return vma_address(page, vma); } /* * Returns the actual pmd_t* where we expect 'address' to be mapped from, or * NULL if it doesn't exist. No guarantees / checks on what the pmd_t* * represents. */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd = NULL; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) goto out; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) goto out; pud = pud_offset(p4d, address); if (!pud_present(*pud)) goto out; pmd = pmd_offset(pud, address); out: return pmd; } struct folio_referenced_arg { int mapcount; int referenced; unsigned long vm_flags; struct mem_cgroup *memcg; }; /* * arg: folio_referenced_arg will be passed */ static bool folio_referenced_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct folio_referenced_arg *pra = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); int referenced = 0; unsigned long start = address, ptes = 0; while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; if (vma->vm_flags & VM_LOCKED) { if (!folio_test_large(folio) || !pvmw.pte) { /* Restore the mlock which got missed */ mlock_vma_folio(folio, vma); page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; return false; /* To break the loop */ } /* * For large folio fully mapped to VMA, will * be handled after the pvmw loop. * * For large folio cross VMA boundaries, it's * expected to be picked by page reclaim. But * should skip reference of pages which are in * the range of VM_LOCKED vma. As page reclaim * should just count the reference of pages out * the range of VM_LOCKED vma. */ ptes++; pra->mapcount--; continue; } if (pvmw.pte) { if (lru_gen_enabled() && pte_young(ptep_get(pvmw.pte))) { lru_gen_look_around(&pvmw); referenced++; } if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) referenced++; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) referenced++; } else { /* unexpected pmd-mapped folio? */ WARN_ON_ONCE(1); } pra->mapcount--; } if ((vma->vm_flags & VM_LOCKED) && folio_test_large(folio) && folio_within_vma(folio, vma)) { unsigned long s_align, e_align; s_align = ALIGN_DOWN(start, PMD_SIZE); e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE); /* folio doesn't cross page table boundary and fully mapped */ if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) { /* Restore the mlock which got missed */ mlock_vma_folio(folio, vma); pra->vm_flags |= VM_LOCKED; return false; /* To break the loop */ } } if (referenced) folio_clear_idle(folio); if (folio_test_clear_young(folio)) referenced++; if (referenced) { pra->referenced++; pra->vm_flags |= vma->vm_flags & ~VM_LOCKED; } if (!pra->mapcount) return false; /* To break the loop */ return true; } static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) { struct folio_referenced_arg *pra = arg; struct mem_cgroup *memcg = pra->memcg; /* * Ignore references from this mapping if it has no recency. If the * folio has been used in another mapping, we will catch it; if this * other mapping is already gone, the unmap path will have set the * referenced flag or activated the folio in zap_pte_range(). */ if (!vma_has_recency(vma)) return true; /* * If we are reclaiming on behalf of a cgroup, skip counting on behalf * of references from different cgroups. */ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) return true; return false; } /** * folio_referenced() - Test if the folio was referenced. * @folio: The folio to test. * @is_locked: Caller holds lock on the folio. * @memcg: target memory cgroup * @vm_flags: A combination of all the vma->vm_flags which referenced the folio. * * Quick test_and_clear_referenced for all mappings of a folio, * * Return: The number of mappings which referenced the folio. Return -1 if * the function bailed out due to rmap lock contention. */ int folio_referenced(struct folio *folio, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags) { int we_locked = 0; struct folio_referenced_arg pra = { .mapcount = folio_mapcount(folio), .memcg = memcg, }; struct rmap_walk_control rwc = { .rmap_one = folio_referenced_one, .arg = (void *)&pra, .anon_lock = folio_lock_anon_vma_read, .try_lock = true, .invalid_vma = invalid_folio_referenced_vma, }; *vm_flags = 0; if (!pra.mapcount) return 0; if (!folio_raw_mapping(folio)) return 0; if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) { we_locked = folio_trylock(folio); if (!we_locked) return 1; } rmap_walk(folio, &rwc); *vm_flags = pra.vm_flags; if (we_locked) folio_unlock(folio); return rwc.contended ? -1 : pra.referenced; } static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) { int cleaned = 0; struct vm_area_struct *vma = pvmw->vma; struct mmu_notifier_range range; unsigned long address = pvmw->address; /* * We have to assume the worse case ie pmd for invalidation. Note that * the folio can not be freed from this function. */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma->vm_mm, address, vma_address_end(pvmw)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(pvmw)) { int ret = 0; address = pvmw->address; if (pvmw->pte) { pte_t *pte = pvmw->pte; pte_t entry = ptep_get(pte); if (!pte_dirty(entry) && !pte_write(entry)) continue; flush_cache_page(vma, address, pte_pfn(entry)); entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(vma->vm_mm, address, pte, entry); ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t *pmd = pvmw->pmd; pmd_t entry; if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) continue; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); entry = pmdp_invalidate(vma, address, pmd); entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); ret = 1; #else /* unexpected pmd-mapped folio? */ WARN_ON_ONCE(1); #endif } if (ret) cleaned++; } mmu_notifier_invalidate_range_end(&range); return cleaned; } static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC); int *cleaned = arg; *cleaned += page_vma_mkclean_one(&pvmw); return true; } static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) { if (vma->vm_flags & VM_SHARED) return false; return true; } int folio_mkclean(struct folio *folio) { int cleaned = 0; struct address_space *mapping; struct rmap_walk_control rwc = { .arg = (void *)&cleaned, .rmap_one = page_mkclean_one, .invalid_vma = invalid_mkclean_vma, }; BUG_ON(!folio_test_locked(folio)); if (!folio_mapped(folio)) return 0; mapping = folio_mapping(folio); if (!mapping) return 0; rmap_walk(folio, &rwc); return cleaned; } EXPORT_SYMBOL_GPL(folio_mkclean); /** * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of * [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff) * within the @vma of shared mappings. And since clean PTEs * should also be readonly, write protects them too. * @pfn: start pfn. * @nr_pages: number of physically contiguous pages srarting with @pfn. * @pgoff: page offset that the @pfn mapped with. * @vma: vma that @pfn mapped within. * * Returns the number of cleaned PTEs (including PMDs). */ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma) { struct page_vma_mapped_walk pvmw = { .pfn = pfn, .nr_pages = nr_pages, .pgoff = pgoff, .vma = vma, .flags = PVMW_SYNC, }; if (invalid_mkclean_vma(vma, NULL)) return 0; pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma); VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma); return page_vma_mkclean_one(&pvmw); } int folio_total_mapcount(struct folio *folio) { int mapcount = folio_entire_mapcount(folio); int nr_pages; int i; /* In the common case, avoid the loop when no pages mapped by PTE */ if (folio_nr_pages_mapped(folio) == 0) return mapcount; /* * Add all the PTE mappings of those pages mapped by PTE. * Limit the loop to folio_nr_pages_mapped()? * Perhaps: given all the raciness, that may be a good or a bad idea. */ nr_pages = folio_nr_pages(folio); for (i = 0; i < nr_pages; i++) mapcount += atomic_read(&folio_page(folio, i)->_mapcount); /* But each of those _mapcounts was based on -1 */ mapcount += nr_pages; return mapcount; } /** * folio_move_anon_rmap - move a folio to our anon_vma * @folio: The folio to move to our anon_vma * @vma: The vma the folio belongs to * * When a folio belongs exclusively to one process after a COW event, * that folio can be moved into the anon_vma that belongs to just that * process, so the rmap code will not search the parent or sibling processes. */ void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma) { void *anon_vma = vma->anon_vma; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); anon_vma += PAGE_MAPPING_ANON; /* * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written * simultaneously, so a concurrent reader (eg folio_referenced()'s * folio_test_anon()) will not see one without the other. */ WRITE_ONCE(folio->mapping, anon_vma); } /** * __folio_set_anon - set up a new anonymous rmap for a folio * @folio: The folio to set up the new anonymous rmap for. * @vma: VM area to add the folio to. * @address: User virtual address of the mapping * @exclusive: Whether the folio is exclusive to the process. */ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, unsigned long address, bool exclusive) { struct anon_vma *anon_vma = vma->anon_vma; BUG_ON(!anon_vma); /* * If the folio isn't exclusive to this vma, we must use the _oldest_ * possible anon_vma for the folio mapping! */ if (!exclusive) anon_vma = anon_vma->root; /* * page_idle does a lockless/optimistic rmap scan on folio->mapping. * Make sure the compiler doesn't split the stores of anon_vma and * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code * could mistake the mapping for a struct address_space and crash. */ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); folio->index = linear_page_index(vma, address); } /** * __page_check_anon_rmap - sanity check anonymous rmap addition * @folio: The folio containing @page. * @page: the page to check the mapping of * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped */ static void __page_check_anon_rmap(struct folio *folio, struct page *page, struct vm_area_struct *vma, unsigned long address) { /* * The page's anon-rmap details (mapping and index) are guaranteed to * be set up correctly at this point. * * We have exclusion against page_add_anon_rmap because the caller * always holds the page locked. * * We have exclusion against page_add_new_anon_rmap because those pages * are initially only visible via the pagetables, and the pte is locked * over the call to page_add_new_anon_rmap. */ VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, folio); VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address), page); } /** * page_add_anon_rmap - add pte mapping to an anonymous page * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * @flags: the rmap flags * * The caller needs to hold the pte lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting, * and to ensure that PageAnon is not being upgraded racily to PageKsm * (but PageKsm is never downgraded to PageAnon). */ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { struct folio *folio = page_folio(page); atomic_t *mapped = &folio->_nr_pages_mapped; int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; bool first; /* Is page being mapped by PTE? Is this its first map to be added? */ if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); nr = first; if (first && folio_test_large(folio)) { nr = atomic_inc_return_relaxed(mapped); nr = (nr < COMPOUND_MAPPED); } } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ first = atomic_inc_and_test(&folio->_entire_mapcount); if (first) { nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { nr_pmdmapped = folio_nr_pages(folio); nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; } else { /* Raced ahead of a remove of COMPOUND_MAPPED */ nr = 0; } } } if (nr_pmdmapped) __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped); if (nr) __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); if (unlikely(!folio_test_anon(folio))) { VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); /* * For a PTE-mapped large folio, we only know that the single * PTE is exclusive. Further, __folio_set_anon() might not get * folio->index right when not given the address of the head * page. */ VM_WARN_ON_FOLIO(folio_test_large(folio) && !compound, folio); __folio_set_anon(folio, vma, address, !!(flags & RMAP_EXCLUSIVE)); } else if (likely(!folio_test_ksm(folio))) { __page_check_anon_rmap(folio, page, vma, address); } if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(page); /* While PTE-mapping a THP we have a PMD and a PTE mapping. */ VM_WARN_ON_FOLIO((atomic_read(&page->_mapcount) > 0 || (folio_test_large(folio) && folio_entire_mapcount(folio) > 1)) && PageAnonExclusive(page), folio); /* * For large folio, only mlock it if it's fully mapped to VMA. It's * not easy to check whether the large folio is fully mapped to VMA * here. Only mlock normal 4K folio and leave page reclaim to handle * large folio. */ if (!folio_test_large(folio)) mlock_vma_folio(folio, vma); } /** * folio_add_new_anon_rmap - Add mapping to a new anonymous folio. * @folio: The folio to add the mapping to. * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * * Like page_add_anon_rmap() but must only be called on *new* folios. * This means the inc-and-test can be bypassed. * The folio does not have to be locked. * * If the folio is large, it is accounted as a THP. As the folio * is new, it's assumed to be mapped exclusively by a single process. */ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { int nr; VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); __folio_set_swapbacked(folio); if (likely(!folio_test_pmd_mappable(folio))) { /* increment count (starts at -1) */ atomic_set(&folio->_mapcount, 0); nr = 1; } else { /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, COMPOUND_MAPPED); nr = folio_nr_pages(folio); __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr); } __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); __folio_set_anon(folio, vma, address, true); SetPageAnonExclusive(&folio->page); } /** * folio_add_file_rmap_range - add pte mapping to page range of a folio * @folio: The folio to add the mapping to * @page: The first page to add * @nr_pages: The number of pages which will be mapped * @vma: the vm area in which the mapping is added * @compound: charge the page as compound or small page * * The page range of folio is defined by [first_page, first_page + nr_pages) * * The caller needs to hold the pte lock. */ void folio_add_file_rmap_range(struct folio *folio, struct page *page, unsigned int nr_pages, struct vm_area_struct *vma, bool compound) { atomic_t *mapped = &folio->_nr_pages_mapped; unsigned int nr_pmdmapped = 0, first; int nr = 0; VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio); /* Is page being mapped by PTE? Is this its first map to be added? */ if (likely(!compound)) { do { first = atomic_inc_and_test(&page->_mapcount); if (first && folio_test_large(folio)) { first = atomic_inc_return_relaxed(mapped); first = (first < COMPOUND_MAPPED); } if (first) nr++; } while (page++, --nr_pages > 0); } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ first = atomic_inc_and_test(&folio->_entire_mapcount); if (first) { nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { nr_pmdmapped = folio_nr_pages(folio); nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; } else { /* Raced ahead of a remove of COMPOUND_MAPPED */ nr = 0; } } } if (nr_pmdmapped) __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ? NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); if (nr) __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); /* See comments in page_add_anon_rmap() */ if (!folio_test_large(folio)) mlock_vma_folio(folio, vma); } /** * page_add_file_rmap - add pte mapping to a file page * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock. */ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { struct folio *folio = page_folio(page); unsigned int nr_pages; VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page); if (likely(!compound)) nr_pages = 1; else nr_pages = folio_nr_pages(folio); folio_add_file_rmap_range(folio, page, nr_pages, vma, compound); } /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from * @vma: the vm area from which the mapping is removed * @compound: uncharge the page as compound or small page * * The caller needs to hold the pte lock. */ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { struct folio *folio = page_folio(page); atomic_t *mapped = &folio->_nr_pages_mapped; int nr = 0, nr_pmdmapped = 0; bool last; enum node_stat_item idx; VM_BUG_ON_PAGE(compound && !PageHead(page), page); /* Hugetlb pages are not counted in NR_*MAPPED */ if (unlikely(folio_test_hugetlb(folio))) { /* hugetlb pages are always mapped with pmds */ atomic_dec(&folio->_entire_mapcount); return; } /* Is page being unmapped by PTE? Is this its last map to be removed? */ if (likely(!compound)) { last = atomic_add_negative(-1, &page->_mapcount); nr = last; if (last && folio_test_large(folio)) { nr = atomic_dec_return_relaxed(mapped); nr = (nr < COMPOUND_MAPPED); } } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ last = atomic_add_negative(-1, &folio->_entire_mapcount); if (last) { nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED)) { nr_pmdmapped = folio_nr_pages(folio); nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) nr = 0; } else { /* An add of COMPOUND_MAPPED raced ahead */ nr = 0; } } } if (nr_pmdmapped) { if (folio_test_anon(folio)) idx = NR_ANON_THPS; else if (folio_test_swapbacked(folio)) idx = NR_SHMEM_PMDMAPPED; else idx = NR_FILE_PMDMAPPED; __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped); } if (nr) { idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; __lruvec_stat_mod_folio(folio, idx, -nr); /* * Queue anon THP for deferred split if at least one * page of the folio is unmapped and at least one page * is still mapped. */ if (folio_test_pmd_mappable(folio) && folio_test_anon(folio)) if (!compound || nr < nr_pmdmapped) deferred_split_folio(folio); } /* * It would be tidy to reset folio_test_anon mapping when fully * unmapped, but that might overwrite a racing page_add_anon_rmap * which increments mapcount after us but sets mapping before us: * so leave the reset to free_pages_prepare, and remember that * it's only reliable while mapped. */ munlock_vma_folio(folio, vma); } /* * @arg: enum ttu_flags will be passed to this argument */ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); pte_t pteval; struct page *subpage; bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long pfn; unsigned long hsz = 0; /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and page_remove_rmap(), * try_to_unmap() may return before page_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) pvmw.flags = PVMW_SYNC; if (flags & TTU_SPLIT_HUGE_PMD) split_huge_pmd_address(vma, address, false, folio); /* * For THP, we have to assume the worse case ie pmd for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. * * Note that the folio can not be freed in this function as call of * try_to_unmap() must hold a reference on the folio. */ range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* * If sharing is possible, start and end will be adjusted * accordingly. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* We need the huge page size for set_huge_pte_at() */ hsz = huge_page_size(hstate_vma(vma)); } mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); /* * If the folio is in an mlock()d vma, we must not swap it out. */ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* Restore the mlock which got missed */ if (!folio_test_large(folio)) mlock_vma_folio(folio, vma); page_vma_mapped_walk_done(&pvmw); ret = false; break; } pfn = pte_pfn(ptep_get(pvmw.pte)); subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { bool anon = folio_test_anon(folio); /* * The try_to_unmap() is only passed a hugetlb page * in the case where the hugetlb page is poisoned. */ VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage); /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may * be cached for this mm, so we must flush them all. * start/end were already adjusted above to cover this * range. */ flush_cache_range(vma, range.start, range.end); /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. * * We also must hold hugetlb vma_lock in write mode. * Lock order dictates acquiring vma_lock BEFORE * i_mmap_rwsem. We can only try lock here and fail * if unsuccessful. */ if (!anon) { VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) { page_vma_mapped_walk_done(&pvmw); ret = false; break; } if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); flush_tlb_range(vma, range.start, range.end); /* * The ref count of the PMD page was * dropped which is part of the way map * counting is done for shared PMDs. * Return 'true' here. When there is * no other sharing, huge_pmd_unshare * returns false and we will unmap the * actual page and drop map count * to zero. */ page_vma_mapped_walk_done(&pvmw); break; } hugetlb_vma_unlock_write(vma); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */ pteval = ptep_get_and_clear(mm, address, pvmw.pte); set_tlb_ubc_flush_pending(mm, pteval, address); } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } } /* * Now the pte is cleared. If this pte was uffd-wp armed, * we may want to replace a none pte with a marker pte if * it's file-backed, so we don't lose the tracking info. */ pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval); /* Set the dirty flag on the folio now the pte is gone. */ if (pte_dirty(pteval)) folio_mark_dirty(folio); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { dec_mm_counter(mm, mm_counter(&folio->page)); set_pte_at(mm, address, pvmw.pte, pteval); } } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan * will take care of the rest. * A future reference will then fault in a new zero * page. When userfaultfd is active, we must not drop * this page though, as its main user (postcopy * migration) will not expect userfaults on already * copied pages. */ dec_mm_counter(mm, mm_counter(&folio->page)); } else if (folio_test_anon(folio)) { swp_entry_t entry = page_swap_entry(subpage); pte_t swp_pte; /* * Store the swap location in the pte. * See handle_pte_fault() ... */ if (unlikely(folio_test_swapbacked(folio) != folio_test_swapcache(folio))) { WARN_ON_ONCE(1); ret = false; page_vma_mapped_walk_done(&pvmw); break; } /* MADV_FREE page check */ if (!folio_test_swapbacked(folio)) { int ref_count, map_count; /* * Synchronize with gup_pte_range(): * - clear PTE; barrier; read refcount * - inc refcount; barrier; read PTE */ smp_mb(); ref_count = folio_ref_count(folio); map_count = folio_mapcount(folio); /* * Order reads for page refcount and dirty flag * (see comments in __remove_mapping()). */ smp_rmb(); /* * The only page refs must be one from isolation * plus the rmap(s) (dropped by discard:). */ if (ref_count == 1 + map_count && !folio_test_dirty(folio)) { dec_mm_counter(mm, MM_ANONPAGES); goto discard; } /* * If the folio was redirtied, it cannot be * discarded. Remap the page to page table. */ set_pte_at(mm, address, pvmw.pte, pteval); folio_set_swapbacked(folio); ret = false; page_vma_mapped_walk_done(&pvmw); break; } if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } if (arch_unmap_one(mm, vma, address, pteval) < 0) { swap_free(entry); set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } /* See page_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && page_try_share_anon_rmap(subpage)) { swap_free(entry); set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } dec_mm_counter(mm, MM_ANONPAGES); inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); if (anon_exclusive) swp_pte = pte_swp_mkexclusive(swp_pte); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); } else { /* * This is a locked file-backed folio, * so it cannot be removed from the page * cache and replaced by a new folio before * mmu_notifier_invalidate_range_end, so no * concurrent thread might update its page table * to point at a new folio while a device is * still using this folio. * * See Documentation/mm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(&folio->page)); } discard: page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put(folio); } mmu_notifier_invalidate_range_end(&range); return ret; } static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) { return vma_is_temporary_stack(vma); } static int folio_not_mapped(struct folio *folio) { return !folio_mapped(folio); } /** * try_to_unmap - Try to remove all page table mappings to a folio. * @folio: The folio to unmap. * @flags: action and flags * * Tries to remove all the page table entries which are mapping this * folio. It is the caller's responsibility to check if the folio is * still mapped if needed (use TTU_SYNC to prevent accounting races). * * Context: Caller must hold the folio lock. */ void try_to_unmap(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(folio, &rwc); else rmap_walk(folio, &rwc); } /* * @arg: enum ttu_flags will be passed to this argument. * * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs * containing migration entries. */ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); pte_t pteval; struct page *subpage; bool anon_exclusive, ret = true; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long pfn; unsigned long hsz = 0; /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and page_remove_rmap(), * try_to_migrate() may return before page_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) pvmw.flags = PVMW_SYNC; /* * unmap_page() in mm/huge_memory.c is the only user of migration with * TTU_SPLIT_HUGE_PMD and it wants to freeze. */ if (flags & TTU_SPLIT_HUGE_PMD) split_huge_pmd_address(vma, address, true, folio); /* * For THP, we have to assume the worse case ie pmd for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. * * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* * If sharing is possible, start and end will be adjusted * accordingly. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* We need the huge page size for set_huge_pte_at() */ hsz = huge_page_size(hstate_vma(vma)); } mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION /* PMD-mapped THP migration entry */ if (!pvmw.pte) { subpage = folio_page(folio, pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || !folio_test_pmd_mappable(folio), folio); if (set_pmd_migration_entry(&pvmw, subpage)) { ret = false; page_vma_mapped_walk_done(&pvmw); break; } continue; } #endif /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); pfn = pte_pfn(ptep_get(pvmw.pte)); if (folio_is_zone_device(folio)) { /* * Our PTE is a non-present device exclusive entry and * calculating the subpage as for the common case would * result in an invalid pointer. * * Since only PAGE_SIZE pages can currently be * migrated, just set it to page. This will need to be * changed when hugepage migrations to device private * memory are supported. */ VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio); subpage = &folio->page; } else { subpage = folio_page(folio, pfn - folio_pfn(folio)); } address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { bool anon = folio_test_anon(folio); /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may * be cached for this mm, so we must flush them all. * start/end were already adjusted above to cover this * range. */ flush_cache_range(vma, range.start, range.end); /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. * * We also must hold hugetlb vma_lock in write mode. * Lock order dictates acquiring vma_lock BEFORE * i_mmap_rwsem. We can only try lock here and * fail if unsuccessful. */ if (!anon) { VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) { page_vma_mapped_walk_done(&pvmw); ret = false; break; } if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); flush_tlb_range(vma, range.start, range.end); /* * The ref count of the PMD page was * dropped which is part of the way map * counting is done for shared PMDs. * Return 'true' here. When there is * no other sharing, huge_pmd_unshare * returns false and we will unmap the * actual page and drop map count * to zero. */ page_vma_mapped_walk_done(&pvmw); break; } hugetlb_vma_unlock_write(vma); } /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */ pteval = ptep_get_and_clear(mm, address, pvmw.pte); set_tlb_ubc_flush_pending(mm, pteval, address); } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } } /* Set the dirty flag on the folio now the pte is gone. */ if (pte_dirty(pteval)) folio_mark_dirty(folio); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); if (folio_is_device_private(folio)) { unsigned long pfn = folio_pfn(folio); swp_entry_t entry; pte_t swp_pte; if (anon_exclusive) BUG_ON(page_try_share_anon_rmap(subpage)); /* * Store the pfn of the page in a special migration * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ entry = pte_to_swp_entry(pteval); if (is_writable_device_private_entry(entry)) entry = make_writable_migration_entry(pfn); else if (anon_exclusive) entry = make_readable_exclusive_migration_entry(pfn); else entry = make_readable_migration_entry(pfn); swp_pte = swp_entry_to_pte(entry); /* * pteval maps a zone device page and is therefore * a swap pte. */ if (pte_swp_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); trace_set_migration_pte(pvmw.address, pte_val(swp_pte), compound_order(&folio->page)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. */ } else if (PageHWPoison(subpage)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { dec_mm_counter(mm, mm_counter(&folio->page)); set_pte_at(mm, address, pvmw.pte, pteval); } } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan * will take care of the rest. * A future reference will then fault in a new zero * page. When userfaultfd is active, we must not drop * this page though, as its main user (postcopy * migration) will not expect userfaults on already * copied pages. */ dec_mm_counter(mm, mm_counter(&folio->page)); } else { swp_entry_t entry; pte_t swp_pte; if (arch_unmap_one(mm, vma, address, pteval) < 0) { if (folio_test_hugetlb(folio)) set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); else set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) && !anon_exclusive, subpage); /* See page_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && page_try_share_anon_rmap(subpage)) { if (folio_test_hugetlb(folio)) set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); else set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } /* * Store the pfn of the page in a special migration * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ if (pte_write(pteval)) entry = make_writable_migration_entry( page_to_pfn(subpage)); else if (anon_exclusive) entry = make_readable_exclusive_migration_entry( page_to_pfn(subpage)); else entry = make_readable_migration_entry( page_to_pfn(subpage)); if (pte_young(pteval)) entry = make_migration_entry_young(entry); if (pte_dirty(pteval)) entry = make_migration_entry_dirty(entry); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); if (folio_test_hugetlb(folio)) set_huge_pte_at(mm, address, pvmw.pte, swp_pte, hsz); else set_pte_at(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), compound_order(&folio->page)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. */ } page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put(folio); } mmu_notifier_invalidate_range_end(&range); return ret; } /** * try_to_migrate - try to replace all page table mappings with swap entries * @folio: the folio to replace page table entries for * @flags: action and flags * * Tries to remove all the page table entries which are mapping this folio and * replace them with special swap entries. Caller must hold the folio lock. */ void try_to_migrate(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_migrate_one, .arg = (void *)flags, .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; /* * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags. */ if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC | TTU_BATCH_FLUSH))) return; if (folio_is_zone_device(folio) && (!folio_is_device_private(folio) && !folio_is_device_coherent(folio))) return; /* * During exec, a temporary VMA is setup and later moved. * The VMA is moved under the anon_vma lock but not the * page tables leading to a race where migration cannot * find the migration ptes. Rather than increasing the * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ if (!folio_test_ksm(folio) && folio_test_anon(folio)) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(folio, &rwc); else rmap_walk(folio, &rwc); } #ifdef CONFIG_DEVICE_PRIVATE struct make_exclusive_args { struct mm_struct *mm; unsigned long address; void *owner; bool valid; }; static bool page_make_device_exclusive_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *priv) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); struct make_exclusive_args *args = priv; pte_t pteval; struct page *subpage; bool ret = true; struct mmu_notifier_range range; swp_entry_t entry; pte_t swp_pte; pte_t ptent; mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, address, min(vma->vm_end, address + folio_size(folio)), args->owner); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); ptent = ptep_get(pvmw.pte); if (!pte_present(ptent)) { ret = false; page_vma_mapped_walk_done(&pvmw); break; } subpage = folio_page(folio, pte_pfn(ptent) - folio_pfn(folio)); address = pvmw.address; /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(ptent)); pteval = ptep_clear_flush(vma, address, pvmw.pte); /* Set the dirty flag on the folio now the pte is gone. */ if (pte_dirty(pteval)) folio_mark_dirty(folio); /* * Check that our target page is still mapped at the expected * address. */ if (args->mm == mm && args->address == address && pte_write(pteval)) args->valid = true; /* * Store the pfn of the page in a special migration * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ if (pte_write(pteval)) entry = make_writable_device_exclusive_entry( page_to_pfn(subpage)); else entry = make_readable_device_exclusive_entry( page_to_pfn(subpage)); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); /* * There is a reference on the page for the swap entry which has * been removed, so shouldn't take another. */ page_remove_rmap(subpage, vma, false); } mmu_notifier_invalidate_range_end(&range); return ret; } /** * folio_make_device_exclusive - Mark the folio exclusively owned by a device. * @folio: The folio to replace page table entries for. * @mm: The mm_struct where the folio is expected to be mapped. * @address: Address where the folio is expected to be mapped. * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks * * Tries to remove all the page table entries which are mapping this * folio and replace them with special device exclusive swap entries to * grant a device exclusive access to the folio. * * Context: Caller must hold the folio lock. * Return: false if the page is still mapped, or if it could not be unmapped * from the expected address. Otherwise returns true (success). */ static bool folio_make_device_exclusive(struct folio *folio, struct mm_struct *mm, unsigned long address, void *owner) { struct make_exclusive_args args = { .mm = mm, .address = address, .owner = owner, .valid = false, }; struct rmap_walk_control rwc = { .rmap_one = page_make_device_exclusive_one, .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, .arg = &args, }; /* * Restrict to anonymous folios for now to avoid potential writeback * issues. */ if (!folio_test_anon(folio)) return false; rmap_walk(folio, &rwc); return args.valid && !folio_mapcount(folio); } /** * make_device_exclusive_range() - Mark a range for exclusive use by a device * @mm: mm_struct of associated target process * @start: start of the region to mark for exclusive device access * @end: end address of region * @pages: returns the pages which were successfully marked for exclusive access * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering * * Returns: number of pages found in the range by GUP. A page is marked for * exclusive access only if the page pointer is non-NULL. * * This function finds ptes mapping page(s) to the given address range, locks * them and replaces mappings with special swap entries preventing userspace CPU * access. On fault these entries are replaced with the original mapping after * calling MMU notifiers. * * A driver using this to program access from a device must use a mmu notifier * critical section to hold a device specific lock during programming. Once * programming is complete it should drop the page lock and reference after * which point CPU access to the page will revoke the exclusive access. */ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct page **pages, void *owner) { long npages = (end - start) >> PAGE_SHIFT; long i; npages = get_user_pages_remote(mm, start, npages, FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, pages, NULL); if (npages < 0) return npages; for (i = 0; i < npages; i++, start += PAGE_SIZE) { struct folio *folio = page_folio(pages[i]); if (PageTail(pages[i]) || !folio_trylock(folio)) { folio_put(folio); pages[i] = NULL; continue; } if (!folio_make_device_exclusive(folio, mm, start, owner)) { folio_unlock(folio); folio_put(folio); pages[i] = NULL; } } return npages; } EXPORT_SYMBOL_GPL(make_device_exclusive_range); #endif void __put_anon_vma(struct anon_vma *anon_vma) { struct anon_vma *root = anon_vma->root; anon_vma_free(anon_vma); if (root != anon_vma && atomic_dec_and_test(&root->refcount)) anon_vma_free(root); } static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; if (rwc->anon_lock) return rwc->anon_lock(folio, rwc); /* * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages * are holding mmap_lock. Users without mmap_lock are required to * take a reference count to prevent the anon_vma disappearing */ anon_vma = folio_anon_vma(folio); if (!anon_vma) return NULL; if (anon_vma_trylock_read(anon_vma)) goto out; if (rwc->try_lock) { anon_vma = NULL; rwc->contended = true; goto out; } anon_vma_lock_read(anon_vma); out: return anon_vma; } /* * rmap_walk_anon - do something to anonymous page using the object-based * rmap method * @folio: the folio to be handled * @rwc: control variable according to each walk type * @locked: caller holds relevant rmap lock * * Find all the mappings of a folio using the mapping pointer and the vma * chains contained in the anon_vma struct it points to. */ static void rmap_walk_anon(struct folio *folio, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; if (locked) { anon_vma = folio_anon_vma(folio); /* anon_vma disappear under us? */ VM_BUG_ON_FOLIO(!anon_vma, folio); } else { anon_vma = rmap_walk_anon_lock(folio, rwc); } if (!anon_vma) return; pgoff_start = folio_pgoff(folio); pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(&folio->page, vma); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(folio)) break; } if (!locked) anon_vma_unlock_read(anon_vma); } /* * rmap_walk_file - do something to file page using the object-based rmap method * @folio: the folio to be handled * @rwc: control variable according to each walk type * @locked: caller holds relevant rmap lock * * Find all the mappings of a folio using the mapping pointer and the vma chains * contained in the address_space struct it points to. */ static void rmap_walk_file(struct folio *folio, struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = folio_mapping(folio); pgoff_t pgoff_start, pgoff_end; struct vm_area_struct *vma; /* * The page lock not only makes sure that page->mapping cannot * suddenly be NULLified by truncation, it makes sure that the * structure at mapping cannot be freed and reused yet, * so we can safely take mapping->i_mmap_rwsem. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!mapping) return; pgoff_start = folio_pgoff(folio); pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; if (!locked) { if (i_mmap_trylock_read(mapping)) goto lookup; if (rwc->try_lock) { rwc->contended = true; return; } i_mmap_lock_read(mapping); } lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { unsigned long address = vma_address(&folio->page, vma); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(folio)) goto done; } done: if (!locked) i_mmap_unlock_read(mapping); } void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { if (unlikely(folio_test_ksm(folio))) rmap_walk_ksm(folio, rwc); else if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, false); else rmap_walk_file(folio, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, true); else rmap_walk_file(folio, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE /* * The following two functions are for anonymous (private mapped) hugepages. * Unlike common anonymous pages, anonymous hugepages have no accounting code * and no lru code, because we handle hugepages differently from common pages. * * RMAP_COMPOUND is ignored. */ void hugepage_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); atomic_inc(&folio->_entire_mapcount); if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(&folio->page); VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 && PageAnonExclusive(&folio->page), folio); } void hugepage_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); __folio_set_anon(folio, vma, address, true); SetPageAnonExclusive(&folio->page); } #endif /* CONFIG_HUGETLB_PAGE */
9 9 9 3 3 3 3 11 10 9 8 1 8 7 7 7 7 7 6 11 2 1 2 1 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 // SPDX-License-Identifier: GPL-2.0-or-later /* L2TPv3 IP encapsulation support * * Copyright (c) 2008,2009,2010 Katalix Systems Ltd */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/ioctls.h> #include <linux/icmp.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/socket.h> #include <linux/l2tp.h> #include <linux/in.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/udp.h> #include <net/inet_common.h> #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> #include "l2tp_core.h" struct l2tp_ip_sock { /* inet_sock has to be the first member of l2tp_ip_sock */ struct inet_sock inet; u32 conn_id; u32 peer_conn_id; }; static DEFINE_RWLOCK(l2tp_ip_lock); static struct hlist_head l2tp_ip_table; static struct hlist_head l2tp_ip_bind_table; static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk) { return (struct l2tp_ip_sock *)sk; } static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr, __be32 raddr, int dif, u32 tunnel_id) { struct sock *sk; sk_for_each_bound(sk, &l2tp_ip_bind_table) { const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); const struct inet_sock *inet = inet_sk(sk); int bound_dev_if; if (!net_eq(sock_net(sk), net)) continue; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && dif && bound_dev_if != dif) continue; if (inet->inet_rcv_saddr && laddr && inet->inet_rcv_saddr != laddr) continue; if (inet->inet_daddr && raddr && inet->inet_daddr != raddr) continue; if (l2tp->conn_id != tunnel_id) continue; goto found; } sk = NULL; found: return sk; } /* When processing receive frames, there are two cases to * consider. Data frames consist of a non-zero session-id and an * optional cookie. Control frames consist of a regular L2TP header * preceded by 32-bits of zeros. * * L2TPv3 Session Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Cookie (optional, maximum 64 bits)... * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * L2TPv3 Control Message Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | (32 bits of zeros) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |T|L|x|x|S|x|x|x|x|x|x|x| Ver | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Control Connection ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ns | Nr | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * All control frames are passed to userspace. */ static int l2tp_ip_recv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sock *sk; u32 session_id; u32 tunnel_id; unsigned char *ptr, *optr; struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; struct iphdr *iph; if (!pskb_may_pull(skb, 4)) goto discard; /* Point to L2TP header */ optr = skb->data; ptr = skb->data; session_id = ntohl(*((__be32 *)ptr)); ptr += 4; /* RFC3931: L2TP/IP packets have the first 4 bytes containing * the session_id. If it is 0, the packet is a L2TP control * frame and the session_id value can be discarded. */ if (session_id == 0) { __skb_pull(skb, 4); goto pass_up; } /* Ok, this is a data packet. Lookup the session. */ session = l2tp_session_get(net, session_id); if (!session) goto discard; tunnel = session->tunnel; if (!tunnel) goto discard_sess; if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) goto discard_sess; l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_dec_refcount(session); return 0; pass_up: /* Get the tunnel_id from the L2TP header */ if (!pskb_may_pull(skb, 12)) goto discard; if ((skb->data[0] & 0xc0) != 0xc0) goto discard; tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = (struct iphdr *)skb_network_header(skb); read_lock_bh(&l2tp_ip_lock); sk = __l2tp_ip_bind_lookup(net, iph->daddr, iph->saddr, inet_iif(skb), tunnel_id); if (!sk) { read_unlock_bh(&l2tp_ip_lock); goto discard; } sock_hold(sk); read_unlock_bh(&l2tp_ip_lock); if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); discard_sess: l2tp_session_dec_refcount(session); goto discard; discard_put: sock_put(sk); discard: kfree_skb(skb); return 0; } static int l2tp_ip_hash(struct sock *sk) { if (sk_unhashed(sk)) { write_lock_bh(&l2tp_ip_lock); sk_add_node(sk, &l2tp_ip_table); write_unlock_bh(&l2tp_ip_lock); } return 0; } static void l2tp_ip_unhash(struct sock *sk) { if (sk_unhashed(sk)) return; write_lock_bh(&l2tp_ip_lock); sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); } static int l2tp_ip_open(struct sock *sk) { /* Prevent autobind. We don't have ports. */ inet_sk(sk)->inet_num = IPPROTO_L2TP; l2tp_ip_hash(sk); return 0; } static void l2tp_ip_close(struct sock *sk, long timeout) { write_lock_bh(&l2tp_ip_lock); hlist_del_init(&sk->sk_bind_node); sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); sk_common_release(sk); } static void l2tp_ip_destroy_sock(struct sock *sk) { struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk); struct sk_buff *skb; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); if (tunnel) l2tp_tunnel_delete(tunnel); } static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *)uaddr; struct net *net = sock_net(sk); int ret; int chk_addr_ret; if (addr_len < sizeof(struct sockaddr_l2tpip)) return -EINVAL; if (addr->l2tp_family != AF_INET) return -EINVAL; lock_sock(sk); ret = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out; if (sk->sk_state != TCP_CLOSE) goto out; chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr); ret = -EADDRNOTAVAIL; if (addr->l2tp_addr.s_addr && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) goto out; if (addr->l2tp_addr.s_addr) { inet->inet_rcv_saddr = addr->l2tp_addr.s_addr; inet->inet_saddr = addr->l2tp_addr.s_addr; } if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ write_lock_bh(&l2tp_ip_lock); if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, 0, sk->sk_bound_dev_if, addr->l2tp_conn_id)) { write_unlock_bh(&l2tp_ip_lock); ret = -EADDRINUSE; goto out; } sk_dst_reset(sk); l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id; sk_add_bind_node(sk, &l2tp_ip_bind_table); sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); ret = 0; sock_reset_flag(sk, SOCK_ZAPPED); out: release_sock(sk); return ret; } static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; int rc; if (addr_len < sizeof(*lsa)) return -EINVAL; if (ipv4_is_multicast(lsa->l2tp_addr.s_addr)) return -EINVAL; lock_sock(sk); /* Must bind first - autobinding does not work */ if (sock_flag(sk, SOCK_ZAPPED)) { rc = -EINVAL; goto out_sk; } rc = __ip4_datagram_connect(sk, uaddr, addr_len); if (rc < 0) goto out_sk; l2tp_ip_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; write_lock_bh(&l2tp_ip_lock); hlist_del_init(&sk->sk_bind_node); sk_add_bind_node(sk, &l2tp_ip_bind_table); write_unlock_bh(&l2tp_ip_lock); out_sk: release_sock(sk); return rc; } static int l2tp_ip_disconnect(struct sock *sk, int flags) { if (sock_flag(sk, SOCK_ZAPPED)) return 0; return __udp_disconnect(sk, flags); } static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); struct l2tp_ip_sock *lsk = l2tp_ip_sk(sk); struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; memset(lsa, 0, sizeof(*lsa)); lsa->l2tp_family = AF_INET; if (peer) { if (!inet->inet_dport) return -ENOTCONN; lsa->l2tp_conn_id = lsk->peer_conn_id; lsa->l2tp_addr.s_addr = inet->inet_daddr; } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; lsa->l2tp_conn_id = lsk->conn_id; lsa->l2tp_addr.s_addr = addr; } return sizeof(*lsa); } static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; /* Charge it to the socket, dropping if the queue is full. */ rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) goto drop; return 0; drop: IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); kfree_skb(skb); return 0; } /* Userspace will call sendmsg() on the tunnel socket to send L2TP * control frames. */ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct sk_buff *skb; int rc; struct inet_sock *inet = inet_sk(sk); struct rtable *rt = NULL; struct flowi4 *fl4; int connected = 0; __be32 daddr; lock_sock(sk); rc = -ENOTCONN; if (sock_flag(sk, SOCK_DEAD)) goto out; /* Get and verify the address. */ if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_l2tpip *, lip, msg->msg_name); rc = -EINVAL; if (msg->msg_namelen < sizeof(*lip)) goto out; if (lip->l2tp_family != AF_INET) { rc = -EAFNOSUPPORT; if (lip->l2tp_family != AF_UNSPEC) goto out; } daddr = lip->l2tp_addr.s_addr; } else { rc = -EDESTADDRREQ; if (sk->sk_state != TCP_ESTABLISHED) goto out; daddr = inet->inet_daddr; connected = 1; } /* Allocate a socket buffer */ rc = -ENOMEM; skb = sock_wmalloc(sk, 2 + NET_SKB_PAD + sizeof(struct iphdr) + 4 + len, 0, GFP_KERNEL); if (!skb) goto error; /* Reserve space for headers, putting IP header on 4-byte boundary. */ skb_reserve(skb, 2 + NET_SKB_PAD); skb_reset_network_header(skb); skb_reserve(skb, sizeof(struct iphdr)); skb_reset_transport_header(skb); /* Insert 0 session_id */ *((__be32 *)skb_put(skb, 4)) = 0; /* Copy user data into skb */ rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc < 0) { kfree_skb(skb); goto error; } fl4 = &inet->cork.fl.u.ip4; if (connected) rt = (struct rtable *)__sk_dst_check(sk, 0); rcu_read_lock(); if (!rt) { const struct ip_options_rcu *inet_opt; inet_opt = rcu_dereference(inet->inet_opt); /* Use correct destination address if we have options. */ if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; if (connected) { sk_setup_caps(sk, &rt->dst); } else { skb_dst_set(skb, &rt->dst); goto xmit; } } /* We don't need to clone dst here, it is guaranteed to not disappear. * __dev_xmit_skb() might force a refcount if needed. */ skb_dst_set_noref(skb, &rt->dst); xmit: /* Queue the packet to IP for output */ rc = ip_queue_xmit(sk, skb, &inet->cork.fl); rcu_read_unlock(); error: if (rc >= 0) rc = len; out: release_sock(sk); return rc; no_route: rcu_read_unlock(); IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); rc = -EHOSTUNREACH; goto out; } static int l2tp_ip_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) goto out; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: return err ? err : copied; } int l2tp_ioctl(struct sock *sk, int cmd, int *karg) { struct sk_buff *skb; switch (cmd) { case SIOCOUTQ: *karg = sk_wmem_alloc_get(sk); break; case SIOCINQ: spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); *karg = skb ? skb->len : 0; spin_unlock_bh(&sk->sk_receive_queue.lock); break; default: return -ENOIOCTLCMD; } return 0; } EXPORT_SYMBOL_GPL(l2tp_ioctl); static struct proto l2tp_ip_prot = { .name = "L2TP/IP", .owner = THIS_MODULE, .init = l2tp_ip_open, .close = l2tp_ip_close, .bind = l2tp_ip_bind, .connect = l2tp_ip_connect, .disconnect = l2tp_ip_disconnect, .ioctl = l2tp_ioctl, .destroy = l2tp_ip_destroy_sock, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .sendmsg = l2tp_ip_sendmsg, .recvmsg = l2tp_ip_recvmsg, .backlog_rcv = l2tp_ip_backlog_recv, .hash = l2tp_ip_hash, .unhash = l2tp_ip_unhash, .obj_size = sizeof(struct l2tp_ip_sock), }; static const struct proto_ops l2tp_ip_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip_getname, .poll = datagram_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, }; static struct inet_protosw l2tp_ip_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_L2TP, .prot = &l2tp_ip_prot, .ops = &l2tp_ip_ops, }; static struct net_protocol l2tp_ip_protocol __read_mostly = { .handler = l2tp_ip_recv, }; static int __init l2tp_ip_init(void) { int err; pr_info("L2TP IP encapsulation support (L2TPv3)\n"); err = proto_register(&l2tp_ip_prot, 1); if (err != 0) goto out; err = inet_add_protocol(&l2tp_ip_protocol, IPPROTO_L2TP); if (err) goto out1; inet_register_protosw(&l2tp_ip_protosw); return 0; out1: proto_unregister(&l2tp_ip_prot); out: return err; } static void __exit l2tp_ip_exit(void) { inet_unregister_protosw(&l2tp_ip_protosw); inet_del_protocol(&l2tp_ip_protocol, IPPROTO_L2TP); proto_unregister(&l2tp_ip_prot); } module_init(l2tp_ip_init); module_exit(l2tp_ip_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("L2TP over IP"); MODULE_VERSION("1.0"); /* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol, * because __stringify doesn't like enums */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 115, 2); MODULE_ALIAS_NET_PF_PROTO(PF_INET, 115);
875 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 /* SPDX-License-Identifier: GPL-2.0 */ /* * Released under the GPLv2 only. */ #include <linux/pm.h> #include <linux/acpi.h> struct usb_hub_descriptor; struct usb_dev_state; /* Functions local to drivers/usb/core/ */ extern int usb_create_sysfs_dev_files(struct usb_device *dev); extern void usb_remove_sysfs_dev_files(struct usb_device *dev); extern void usb_create_sysfs_intf_files(struct usb_interface *intf); extern void usb_remove_sysfs_intf_files(struct usb_interface *intf); extern int usb_update_wireless_status_attr(struct usb_interface *intf); extern int usb_create_ep_devs(struct device *parent, struct usb_host_endpoint *endpoint, struct usb_device *udev); extern void usb_remove_ep_devs(struct usb_host_endpoint *endpoint); extern void usb_enable_endpoint(struct usb_device *dev, struct usb_host_endpoint *ep, bool reset_toggle); extern void usb_enable_interface(struct usb_device *dev, struct usb_interface *intf, bool reset_toggles); extern void usb_disable_endpoint(struct usb_device *dev, unsigned int epaddr, bool reset_hardware); extern void usb_disable_interface(struct usb_device *dev, struct usb_interface *intf, bool reset_hardware); extern void usb_release_interface_cache(struct kref *ref); extern void usb_disable_device(struct usb_device *dev, int skip_ep0); extern int usb_deauthorize_device(struct usb_device *); extern int usb_authorize_device(struct usb_device *); extern void usb_deauthorize_interface(struct usb_interface *); extern void usb_authorize_interface(struct usb_interface *); extern void usb_detect_quirks(struct usb_device *udev); extern void usb_detect_interface_quirks(struct usb_device *udev); extern void usb_release_quirk_list(void); extern bool usb_endpoint_is_ignored(struct usb_device *udev, struct usb_host_interface *intf, struct usb_endpoint_descriptor *epd); extern int usb_remove_device(struct usb_device *udev); extern struct usb_device_descriptor *usb_get_device_descriptor( struct usb_device *udev); extern int usb_set_isoch_delay(struct usb_device *dev); extern int usb_get_bos_descriptor(struct usb_device *dev); extern void usb_release_bos_descriptor(struct usb_device *dev); extern int usb_set_configuration(struct usb_device *dev, int configuration); extern int usb_choose_configuration(struct usb_device *udev); extern int usb_generic_driver_probe(struct usb_device *udev); extern void usb_generic_driver_disconnect(struct usb_device *udev); extern int usb_generic_driver_suspend(struct usb_device *udev, pm_message_t msg); extern int usb_generic_driver_resume(struct usb_device *udev, pm_message_t msg); static inline unsigned usb_get_max_power(struct usb_device *udev, struct usb_host_config *c) { /* SuperSpeed power is in 8 mA units; others are in 2 mA units */ unsigned mul = (udev->speed >= USB_SPEED_SUPER ? 8 : 2); return c->desc.bMaxPower * mul; } extern void usb_kick_hub_wq(struct usb_device *dev); extern int usb_match_one_id_intf(struct usb_device *dev, struct usb_host_interface *intf, const struct usb_device_id *id); extern int usb_match_device(struct usb_device *dev, const struct usb_device_id *id); extern const struct usb_device_id *usb_device_match_id(struct usb_device *udev, const struct usb_device_id *id); extern bool usb_driver_applicable(struct usb_device *udev, struct usb_device_driver *udrv); extern void usb_forced_unbind_intf(struct usb_interface *intf); extern void usb_unbind_and_rebind_marked_interfaces(struct usb_device *udev); extern void usb_hub_release_all_ports(struct usb_device *hdev, struct usb_dev_state *owner); extern bool usb_device_is_owned(struct usb_device *udev); extern int usb_hub_init(void); extern void usb_hub_cleanup(void); extern int usb_major_init(void); extern void usb_major_cleanup(void); extern int usb_device_supports_lpm(struct usb_device *udev); extern int usb_port_disable(struct usb_device *udev); #ifdef CONFIG_PM extern int usb_suspend(struct device *dev, pm_message_t msg); extern int usb_resume(struct device *dev, pm_message_t msg); extern int usb_resume_complete(struct device *dev); extern int usb_port_suspend(struct usb_device *dev, pm_message_t msg); extern int usb_port_resume(struct usb_device *dev, pm_message_t msg); extern void usb_autosuspend_device(struct usb_device *udev); extern int usb_autoresume_device(struct usb_device *udev); extern int usb_remote_wakeup(struct usb_device *dev); extern int usb_runtime_suspend(struct device *dev); extern int usb_runtime_resume(struct device *dev); extern int usb_runtime_idle(struct device *dev); extern int usb_enable_usb2_hardware_lpm(struct usb_device *udev); extern int usb_disable_usb2_hardware_lpm(struct usb_device *udev); extern void usbfs_notify_suspend(struct usb_device *udev); extern void usbfs_notify_resume(struct usb_device *udev); #else static inline int usb_port_suspend(struct usb_device *udev, pm_message_t msg) { return 0; } static inline int usb_port_resume(struct usb_device *udev, pm_message_t msg) { return 0; } #define usb_autosuspend_device(udev) do {} while (0) static inline int usb_autoresume_device(struct usb_device *udev) { return 0; } static inline int usb_enable_usb2_hardware_lpm(struct usb_device *udev) { return 0; } static inline int usb_disable_usb2_hardware_lpm(struct usb_device *udev) { return 0; } #endif extern const struct class usbmisc_class; extern const struct bus_type usb_bus_type; extern struct mutex usb_port_peer_mutex; extern struct device_type usb_device_type; extern struct device_type usb_if_device_type; extern struct device_type usb_ep_device_type; extern struct device_type usb_port_device_type; extern struct usb_device_driver usb_generic_driver; static inline int is_usb_device(const struct device *dev) { return dev->type == &usb_device_type; } static inline int is_usb_interface(const struct device *dev) { return dev->type == &usb_if_device_type; } static inline int is_usb_endpoint(const struct device *dev) { return dev->type == &usb_ep_device_type; } static inline int is_usb_port(const struct device *dev) { return dev->type == &usb_port_device_type; } static inline int is_root_hub(struct usb_device *udev) { return (udev->parent == NULL); } /* Do the same for device drivers and interface drivers. */ static inline int is_usb_device_driver(struct device_driver *drv) { return container_of(drv, struct usbdrv_wrap, driver)-> for_devices; } /* for labeling diagnostics */ extern const char *usbcore_name; /* sysfs stuff */ extern const struct attribute_group *usb_device_groups[]; extern const struct attribute_group *usb_interface_groups[]; /* usbfs stuff */ extern struct usb_driver usbfs_driver; extern const struct file_operations usbfs_devices_fops; extern const struct file_operations usbdev_file_operations; extern int usb_devio_init(void); extern void usb_devio_cleanup(void); /* * Firmware specific cookie identifying a port's location. '0' == no location * data available */ typedef u32 usb_port_location_t; /* internal notify stuff */ extern void usb_notify_add_device(struct usb_device *udev); extern void usb_notify_remove_device(struct usb_device *udev); extern void usb_notify_add_bus(struct usb_bus *ubus); extern void usb_notify_remove_bus(struct usb_bus *ubus); extern void usb_hub_adjust_deviceremovable(struct usb_device *hdev, struct usb_hub_descriptor *desc); #ifdef CONFIG_ACPI extern int usb_acpi_register(void); extern void usb_acpi_unregister(void); extern acpi_handle usb_get_hub_port_acpi_handle(struct usb_device *hdev, int port1); #else static inline int usb_acpi_register(void) { return 0; }; static inline void usb_acpi_unregister(void) { }; #endif
2 56 5 56 5 1 1 2 3 8 56 1 1 1 1 1 1 1 56 56 8 8 8 8 8 18 17 18 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 /* * INETPEER - A storage for permanent information about peers * * This source is covered by the GNU GPL, the same as all kernel sources. * * Authors: Andrey V. Savochkin <saw@msu.ru> */ #include <linux/cache.h> #include <linux/module.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/random.h> #include <linux/timer.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/net.h> #include <linux/workqueue.h> #include <net/ip.h> #include <net/inetpeer.h> #include <net/secure_seq.h> /* * Theory of operations. * We keep one entry for each peer IP address. The nodes contains long-living * information about the peer which doesn't depend on routes. * * Nodes are removed only when reference counter goes to 0. * When it's happened the node may be removed when a sufficient amount of * time has been passed since its last use. The less-recently-used entry can * also be removed if the pool is overloaded i.e. if the total amount of * entries is greater-or-equal than the threshold. * * Node pool is organised as an RB tree. * Such an implementation has been chosen not just for fun. It's a way to * prevent easy and efficient DoS attacks by creating hash collisions. A huge * amount of long living nodes in a single hash slot would significantly delay * lookups performed with disabled BHs. * * Serialisation issues. * 1. Nodes may appear in the tree only with the pool lock held. * 2. Nodes may disappear from the tree only with the pool lock held * AND reference count being 0. * 3. Global variable peer_total is modified under the pool lock. * 4. struct inet_peer fields modification: * rb_node: pool lock * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing * daddr: unchangeable */ static struct kmem_cache *peer_cachep __ro_after_init; void inet_peer_base_init(struct inet_peer_base *bp) { bp->rb_root = RB_ROOT; seqlock_init(&bp->lock); bp->total = 0; } EXPORT_SYMBOL_GPL(inet_peer_base_init); #define PEER_MAX_GC 32 /* Exported for sysctl_net_ipv4. */ int inet_peer_threshold __read_mostly; /* start to throw entries more * aggressively at this stage */ int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) { u64 nr_entries; /* 1% of physical memory */ nr_entries = div64_ul((u64)totalram_pages() << PAGE_SHIFT, 100 * L1_CACHE_ALIGN(sizeof(struct inet_peer))); inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128); peer_cachep = kmem_cache_create("inet_peer_cache", sizeof(struct inet_peer), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } /* Called with rcu_read_lock() or base->lock held */ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, struct inet_peer_base *base, unsigned int seq, struct inet_peer *gc_stack[], unsigned int *gc_cnt, struct rb_node **parent_p, struct rb_node ***pp_p) { struct rb_node **pp, *parent, *next; struct inet_peer *p; pp = &base->rb_root.rb_node; parent = NULL; while (1) { int cmp; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; p = rb_entry(parent, struct inet_peer, rb_node); cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { if (!refcount_inc_not_zero(&p->refcnt)) break; return p; } if (gc_stack) { if (*gc_cnt < PEER_MAX_GC) gc_stack[(*gc_cnt)++] = p; } else if (unlikely(read_seqretry(&base->lock, seq))) { break; } if (cmp == -1) pp = &next->rb_left; else pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; return NULL; } static void inetpeer_free_rcu(struct rcu_head *head) { kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); } /* perform garbage collect on all items stacked during a lookup */ static void inet_peer_gc(struct inet_peer_base *base, struct inet_peer *gc_stack[], unsigned int gc_cnt) { int peer_threshold, peer_maxttl, peer_minttl; struct inet_peer *p; __u32 delta, ttl; int i; peer_threshold = READ_ONCE(inet_peer_threshold); peer_maxttl = READ_ONCE(inet_peer_maxttl); peer_minttl = READ_ONCE(inet_peer_minttl); if (base->total >= peer_threshold) ttl = 0; /* be aggressive */ else ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ * base->total / peer_threshold * HZ; for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; /* The READ_ONCE() pairs with the WRITE_ONCE() * in inet_putpeer() */ delta = (__u32)jiffies - READ_ONCE(p->dtime); if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) gc_stack[i] = NULL; } for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; if (p) { rb_erase(&p->rb_node, &base->rb_root); base->total--; call_rcu(&p->rcu, inetpeer_free_rcu); } } } struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr, int create) { struct inet_peer *p, *gc_stack[PEER_MAX_GC]; struct rb_node **pp, *parent; unsigned int gc_cnt, seq; int invalidated; /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ rcu_read_lock(); seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); invalidated = read_seqretry(&base->lock, seq); rcu_read_unlock(); if (p) return p; /* If no writer did a change during our lookup, we can return early. */ if (!create && !invalidated) return NULL; /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ parent = NULL; write_seqlock_bh(&base->lock); gc_cnt = 0; p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); if (!p && create) { p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); if (p) { p->daddr = *daddr; p->dtime = (__u32)jiffies; refcount_set(&p->refcnt, 2); atomic_set(&p->rid, 0); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; p->n_redirects = 0; /* 60*HZ is arbitrary, but chosen enough high so that the first * calculation of tokens is at its maximum. */ p->rate_last = jiffies - 60*HZ; rb_link_node(&p->rb_node, parent, pp); rb_insert_color(&p->rb_node, &base->rb_root); base->total++; } } if (gc_cnt) inet_peer_gc(base, gc_stack, gc_cnt); write_sequnlock_bh(&base->lock); return p; } EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { /* The WRITE_ONCE() pairs with itself (we run lockless) * and the READ_ONCE() in inet_peer_gc() */ WRITE_ONCE(p->dtime, (__u32)jiffies); if (refcount_dec_and_test(&p->refcnt)) call_rcu(&p->rcu, inetpeer_free_rcu); } EXPORT_SYMBOL_GPL(inet_putpeer); /* * Check transmit rate limitation for given message. * The rate information is held in the inet_peer entries now. * This function is generic and could be used for other purposes * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. * * Note that the same inet_peer fields are modified by functions in * route.c too, but these work for packet destinations while xrlim_allow * works for icmp destinations. This means the rate limiting information * for one "ip object" is shared - and these ICMPs are twice limited: * by source and by destination. * * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate * SHOULD allow setting of rate limits * * Shared between ICMPv4 and ICMPv6. */ #define XRLIM_BURST_FACTOR 6 bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) { unsigned long now, token; bool rc = false; if (!peer) return true; token = peer->rate_tokens; now = jiffies; token += now - peer->rate_last; peer->rate_last = now; if (token > XRLIM_BURST_FACTOR * timeout) token = XRLIM_BURST_FACTOR * timeout; if (token >= timeout) { token -= timeout; rc = true; } peer->rate_tokens = token; return rc; } EXPORT_SYMBOL(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { struct rb_node *p = rb_first(&base->rb_root); while (p) { struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); p = rb_next(p); rb_erase(&peer->rb_node, &base->rb_root); inet_putpeer(peer); cond_resched(); } base->total = 0; } EXPORT_SYMBOL(inetpeer_invalidate_tree);
1262 1 2 3 4 5 6 7 8 9 10 11 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_NS_HASH_H__ #define __NET_NS_HASH_H__ #include <net/net_namespace.h> static inline u32 net_hash_mix(const struct net *net) { return net->hash_mix; } #endif
23 212 414 414 413 330 325 329 329 329 329 329 329 327 271 328 330 191 194 194 194 194 23 23 23 23 23 23 23 22 23 194 194 194 194 194 194 194 194 194 194 194 194 170 194 194 194 23 23 23 23 23 221 221 220 33 33 17 33 1073 1072 1073 1072 1073 1072 1072 189 1073 1070 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/file.c - kernfs file implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/uio.h> #include "kernfs-internal.h" struct kernfs_open_node { struct rcu_head rcu_head; atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ unsigned int nr_mmapped; unsigned int nr_to_release; }; /* * kernfs_notify() may be called from any context and bounces notifications * through a work item. To minimize space overhead in kernfs_node, the * pending queue is implemented as a singly linked list of kernfs_nodes. * The list is terminated with the self pointer so that whether a * kernfs_node is on the list or not can be determined by testing the next * pointer for %NULL. */ #define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) static DEFINE_SPINLOCK(kernfs_notify_lock); static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn) { int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS); return &kernfs_locks->open_file_mutex[idx]; } static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn) { struct mutex *lock; lock = kernfs_open_file_mutex_ptr(kn); mutex_lock(lock); return lock; } /** * of_on - Get the kernfs_open_node of the specified kernfs_open_file * @of: target kernfs_open_file * * Return: the kernfs_open_node of the kernfs_open_file */ static struct kernfs_open_node *of_on(struct kernfs_open_file *of) { return rcu_dereference_protected(of->kn->attr.open, !list_empty(&of->list)); } /** * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * * @kn: target kernfs_node. * * Fetch and return ->attr.open of @kn when caller holds the * kernfs_open_file_mutex_ptr(kn). * * Update of ->attr.open happens under kernfs_open_file_mutex_ptr(kn). So when * the caller guarantees that this mutex is being held, other updaters can't * change ->attr.open and this means that we can safely deref ->attr.open * outside RCU read-side critical section. * * The caller needs to make sure that kernfs_open_file_mutex is held. * * Return: @kn->attr.open when kernfs_open_file_mutex is held. */ static struct kernfs_open_node * kernfs_deref_open_node_locked(struct kernfs_node *kn) { return rcu_dereference_protected(kn->attr.open, lockdep_is_held(kernfs_open_file_mutex_ptr(kn))); } static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; } /* * Determine the kernfs_ops for the given kernfs_node. This function must * be called while holding an active reference. */ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) { if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kn->attr.ops; } /* * As kernfs_seq_stop() is also called after kernfs_seq_start() or * kernfs_seq_next() failure, it needs to distinguish whether it's stopping * a seq_file iteration which is fully initialized with an active reference * or an aborted kernfs_seq_start() due to get_active failure. The * position pointer is the only context for each seq_file iteration and * thus the stop condition should be encoded in it. As the return value is * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable * choice to indicate get_active failure. * * Unfortunately, this is complicated due to the optional custom seq_file * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or * custom seq_file operations and thus can't decide whether put_active * should be performed or not only on ERR_PTR(-ENODEV). * * This is worked around by factoring out the custom seq_stop() and * put_active part into kernfs_seq_stop_active(), skipping it from * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures * that kernfs_seq_stop_active() is skipped only after get_active failure. */ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_stop) ops->seq_stop(sf, v); kernfs_put_active(of->kn); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); if (ops->seq_start) { void *next = ops->seq_start(sf, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } return single_start(sf, ppos); } static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { void *next = ops->seq_next(sf, v, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } else { /* * The same behavior and code as single_open(), always * terminate after the initial read. */ ++*ppos; return NULL; } } static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; if (v != ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, v); mutex_unlock(&of->mutex); } static int kernfs_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; of->event = atomic_read(&of_on(of)->event); return of->kn->attr.ops->seq_show(sf, v); } static const struct seq_operations kernfs_seq_ops = { .start = kernfs_seq_start, .next = kernfs_seq_next, .stop = kernfs_seq_stop, .show = kernfs_seq_show, }; /* * As reading a bin file can have side-effects, the exact offset and bytes * specified in read(2) call should be passed to the read callback making * it difficult to use seq_file. Implement simplistic custom buffering for * bin files. */ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; char *buf; buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; } of->event = atomic_read(&of_on(of)->event); ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len < 0) goto out_free; if (copy_to_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) { if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW) return seq_read_iter(iocb, iter); return kernfs_file_read_iter(iocb, iter); } /* * Copy data in from userland and pass it to the matching kernfs write * operation. * * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come on * the first write. Hint: if you're writing a value, first read the file, * modify only the value you're changing, then write entire buffer * back. */ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = iov_iter_count(iter); const struct kernfs_ops *ops; char *buf; if (of->atomic_write_len) { if (len > of->atomic_write_len) return -E2BIG; } else { len = min_t(size_t, len, PAGE_SIZE); } buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } buf[len] = '\0'; /* guarantee string termination */ /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; } ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len > 0) iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static void kernfs_vma_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); if (!of->vm_ops) return; if (!kernfs_get_active(of->kn)) return; if (of->vm_ops->open) of->vm_ops->open(vma); kernfs_put_active(of->kn); } static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); kernfs_put_active(of->kn); return ret; } static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = 0; if (of->vm_ops->page_mkwrite) ret = of->vm_ops->page_mkwrite(vmf); else file_update_time(file); kernfs_put_active(of->kn); return ret; } static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return -EINVAL; if (!kernfs_get_active(of->kn)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); kernfs_put_active(of->kn); return ret; } static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; int rc; /* * mmap path and of->mutex are prone to triggering spurious lockdep * warnings and we don't want to add spurious locking dependency * between the two. Check whether mmap is actually implemented * without grabbing @of->mutex by testing HAS_MMAP flag. See the * comment in kernfs_file_open() for more details. */ if (!(of->kn->flags & KERNFS_HAS_MMAP)) return -ENODEV; mutex_lock(&of->mutex); rc = -ENODEV; if (!kernfs_get_active(of->kn)) goto out_unlock; ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); if (rc) goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() * to satisfy versions of X which crash if the mmap fails: that * substitutes a new vm_file, and we don't then want bin_vm_ops. */ if (vma->vm_file != file) goto out_put; rc = -EINVAL; if (of->mmapped && of->vm_ops != vma->vm_ops) goto out_put; /* * It is not possible to successfully wrap close. * So error if someone is trying to use close. */ if (vma->vm_ops && vma->vm_ops->close) goto out_put; rc = 0; of->mmapped = true; of_on(of)->nr_mmapped++; of->vm_ops = vma->vm_ops; vma->vm_ops = &kernfs_vm_ops; out_put: kernfs_put_active(of->kn); out_unlock: mutex_unlock(&of->mutex); return rc; } /** * kernfs_get_open_node - get or create kernfs_open_node * @kn: target kernfs_node * @of: kernfs_open_file for this instance of open * * If @kn->attr.open exists, increment its reference count; otherwise, * create one. @of is chained to the files list. * * Locking: * Kernel thread context (may sleep). * * Return: * %0 on success, -errno on failure. */ static int kernfs_get_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { /* not there, initialize a new one */ on = kzalloc(sizeof(*on), GFP_KERNEL); if (!on) { mutex_unlock(mutex); return -ENOMEM; } atomic_set(&on->event, 1); init_waitqueue_head(&on->poll); INIT_LIST_HEAD(&on->files); rcu_assign_pointer(kn->attr.open, on); } list_add_tail(&of->list, &on->files); if (kn->flags & KERNFS_HAS_RELEASE) on->nr_to_release++; mutex_unlock(mutex); return 0; } /** * kernfs_unlink_open_file - Unlink @of from @kn. * * @kn: target kernfs_node * @of: associated kernfs_open_file * @open_failed: ->open() failed, cancel ->release() * * Unlink @of from list of @kn's associated open files. If list of * associated open files becomes empty, disassociate and free * kernfs_open_node. * * LOCKING: * None. */ static void kernfs_unlink_open_file(struct kernfs_node *kn, struct kernfs_open_file *of, bool open_failed) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } if (of) { if (kn->flags & KERNFS_HAS_RELEASE) { WARN_ON_ONCE(of->released == open_failed); if (open_failed) on->nr_to_release--; } if (of->mmapped) on->nr_mmapped--; list_del(&of->list); } if (list_empty(&on->files)) { rcu_assign_pointer(kn->attr.open, NULL); kfree_rcu(on, rcu_head); } mutex_unlock(mutex); } static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; int error = -EACCES; if (!kernfs_get_active(kn)) return -ENODEV; ops = kernfs_ops(kn); has_read = ops->seq_show || ops->read || ops->mmap; has_write = ops->write || ops->mmap; has_mmap = ops->mmap; /* see the flag definition for details */ if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { if ((file->f_mode & FMODE_WRITE) && (!(inode->i_mode & S_IWUGO) || !has_write)) goto err_out; if ((file->f_mode & FMODE_READ) && (!(inode->i_mode & S_IRUGO) || !has_read)) goto err_out; } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); if (!of) goto err_out; /* * The following is done to give a different lockdep key to * @of->mutex for files which implement mmap. This is a rather * crude way to avoid false positive lockdep warning around * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under * which mm->mmap_lock nests, while holding @of->mutex. As each * open file has a separate mutex, it's okay as long as those don't * happen on the same file. At this point, we can't easily give * each file a separate locking class. Let's differentiate on * whether the file has mmap or not for now. * * Both paths of the branch look the same. They're supposed to * look that way and give @of->mutex different static lockdep keys. */ if (has_mmap) mutex_init(&of->mutex); else mutex_init(&of->mutex); of->kn = kn; of->file = file; /* * Write path needs to atomic_write_len outside active reference. * Cache it in open_file. See kernfs_fop_write_iter() for details. */ of->atomic_write_len = ops->atomic_write_len; error = -EINVAL; /* * ->seq_show is incompatible with ->prealloc, * as seq_read does its own allocation. * ->read must be used instead. */ if (ops->prealloc && ops->seq_show) goto err_free; if (ops->prealloc) { int len = of->atomic_write_len ?: PAGE_SIZE; of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); error = -ENOMEM; if (!of->prealloc_buf) goto err_free; mutex_init(&of->prealloc_mutex); } /* * Always instantiate seq_file even if read access doesn't use * seq_file or is not requested. This unifies private data access * and readable regular files are the vast majority anyway. */ if (ops->seq_show) error = seq_open(file, &kernfs_seq_ops); else error = seq_open(file, NULL); if (error) goto err_free; of->seq_file = file->private_data; of->seq_file->private = of; /* seq_file clears PWRITE unconditionally, restore it if WRITE */ if (file->f_mode & FMODE_WRITE) file->f_mode |= FMODE_PWRITE; /* make sure we have open node struct */ error = kernfs_get_open_node(kn, of); if (error) goto err_seq_release; if (ops->open) { /* nobody has access to @of yet, skip @of->mutex */ error = ops->open(of); if (error) goto err_put_node; } /* open succeeded, put active references */ kernfs_put_active(kn); return 0; err_put_node: kernfs_unlink_open_file(kn, of, true); err_seq_release: seq_release(inode, file); err_free: kfree(of->prealloc_buf); kfree(of); err_out: kernfs_put_active(kn); return error; } /* used from release/drain to ensure that ->release() is called exactly once */ static void kernfs_release_file(struct kernfs_node *kn, struct kernfs_open_file *of) { /* * @of is guaranteed to have no other file operations in flight and * we just want to synchronize release and drain paths. * @kernfs_open_file_mutex_ptr(kn) is enough. @of->mutex can't be used * here because drain path may be called from places which can * cause circular dependency. */ lockdep_assert_held(kernfs_open_file_mutex_ptr(kn)); if (!of->released) { /* * A file is never detached without being released and we * need to be able to release files which are deactivated * and being drained. Don't use kernfs_ops(). */ kn->attr.ops->release(of); of->released = true; of_on(of)->nr_to_release--; } } static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); if (kn->flags & KERNFS_HAS_RELEASE) { struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); kernfs_release_file(kn, of); mutex_unlock(mutex); } kernfs_unlink_open_file(kn, of, false); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); return 0; } bool kernfs_should_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; bool ret; /* * @kn being deactivated guarantees that @kn->attr.open can't change * beneath us making the lockless test below safe. */ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); rcu_read_lock(); on = rcu_dereference(kn->attr.open); ret = on && (on->nr_mmapped || on->nr_to_release); rcu_read_unlock(); return ret; } void kernfs_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); if (of->mmapped) { unmap_mapping_range(inode->i_mapping, 0, 0, 1); of->mmapped = false; on->nr_mmapped--; } if (kn->flags & KERNFS_HAS_RELEASE) kernfs_release_file(kn, of); } WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release); mutex_unlock(mutex); } /* * Kernfs attribute files are pollable. The idea is that you read * the content and then you use 'poll' or 'select' to wait for * the content to change. When the content changes (assuming the * manager for the kobject supports notification), poll will * return EPOLLERR|EPOLLPRI, and select will return the fd whether * it is waiting for read, write, or exceptions. * Once poll/select indicates that the value has changed, you * need to close and re-open the file, or seek to 0 and read again. * Reminder: this only works for attributes which actively support * it, and it is not possible to test an attribute from userspace * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) { struct kernfs_open_node *on = of_on(of); poll_wait(of->file, &on->poll, wait); if (of->event != atomic_read(&on->event)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; return DEFAULT_POLLMASK; } static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); __poll_t ret; if (!kernfs_get_active(kn)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; if (kn->attr.ops->poll) ret = kn->attr.ops->poll(of, wait); else ret = kernfs_generic_poll(of, wait); kernfs_put_active(kn); return ret; } static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; loff_t ret; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { mutex_unlock(&of->mutex); return -ENODEV; } ops = kernfs_ops(of->kn); if (ops->llseek) ret = ops->llseek(of, offset, whence); else ret = generic_file_llseek(file, offset, whence); kernfs_put_active(of->kn); mutex_unlock(&of->mutex); return ret; } static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; struct kernfs_super_info *info; struct kernfs_root *root; repeat: /* pop one off the notify_list */ spin_lock_irq(&kernfs_notify_lock); kn = kernfs_notify_list; if (kn == KERNFS_NOTIFY_EOL) { spin_unlock_irq(&kernfs_notify_lock); return; } kernfs_notify_list = kn->attr.notify_next; kn->attr.notify_next = NULL; spin_unlock_irq(&kernfs_notify_lock); root = kernfs_root(kn); /* kick fsnotify */ down_read(&root->kernfs_supers_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; struct inode *p_inode = NULL; struct inode *inode; struct qstr name; /* * We want fsnotify_modify() on @kn but as the * modifications aren't originating from userland don't * have the matching @file available. Look up the inodes * and generate the events manually. */ inode = ilookup(info->sb, kernfs_ino(kn)); if (!inode) continue; name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name)); parent = kernfs_get_parent(kn); if (parent) { p_inode = ilookup(info->sb, kernfs_ino(parent)); if (p_inode) { fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, p_inode, &name, inode, 0); iput(p_inode); } kernfs_put(parent); } if (!p_inode) fsnotify_inode(inode, FS_MODIFY); iput(inode); } up_read(&root->kernfs_supers_rwsem); kernfs_put(kn); goto repeat; } /** * kernfs_notify - notify a kernfs file * @kn: file to notify * * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any * context. */ void kernfs_notify(struct kernfs_node *kn) { static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); unsigned long flags; struct kernfs_open_node *on; if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) return; /* kick poll immediately */ rcu_read_lock(); on = rcu_dereference(kn->attr.open); if (on) { atomic_inc(&on->event); wake_up_interruptible(&on->poll); } rcu_read_unlock(); /* schedule work to kick fsnotify */ spin_lock_irqsave(&kernfs_notify_lock, flags); if (!kn->attr.notify_next) { kernfs_get(kn); kn->attr.notify_next = kernfs_notify_list; kernfs_notify_list = kn; schedule_work(&kernfs_notify_work); } spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { .read_iter = kernfs_fop_read_iter, .write_iter = kernfs_fop_write_iter, .llseek = kernfs_fop_llseek, .mmap = kernfs_fop_mmap, .open = kernfs_fop_open, .release = kernfs_fop_release, .poll = kernfs_fop_poll, .fsync = noop_fsync, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, }; /** * __kernfs_create_file - kernfs internal function to create a file * @parent: directory to create the file in * @name: name of the file * @mode: mode of the file * @uid: uid of the file * @gid: gid of the file * @size: size of the file * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Return: the created node on success, ERR_PTR() value on error. */ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { struct kernfs_node *kn; unsigned flags; int rc; flags = KERNFS_FILE; kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, uid, gid, flags); if (!kn) return ERR_PTR(-ENOMEM); kn->attr.ops = ops; kn->attr.size = size; kn->ns = ns; kn->priv = priv; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { lockdep_init_map(&kn->dep_map, "kn->active", key, 0); kn->flags |= KERNFS_LOCKDEP; } #endif /* * kn->attr.ops is accessible only while holding active ref. We * need to know whether some ops are implemented outside active * ref. Cache their existence in flags. */ if (ops->seq_show) kn->flags |= KERNFS_HAS_SEQ_SHOW; if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; if (ops->release) kn->flags |= KERNFS_HAS_RELEASE; rc = kernfs_add_one(kn); if (rc) { kernfs_put(kn); return ERR_PTR(rc); } return kn; }
65 9 5 50 50 50 3 115 115 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 /* SPDX-License-Identifier: GPL-2.0-only */ /* include/net/xdp.h * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ #include <linux/bitfield.h> #include <linux/filter.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* skb_shared_info */ /** * DOC: XDP RX-queue information * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring queues. It is information that is specific to how * the driver have configured a given RX-ring queue. * * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP * data-path read-access to RX-info for both kernel and bpf-side * (limited subset). * * For now, direct access is only safe while running in NAPI/softirq * context. Contents are read-mostly and must not be updated during * driver NAPI/softirq poll. * * The driver usage API is a register and unregister API. * * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver * naturally need to stop the RX-ring before purging and reallocating * memory. In that process the driver MUST call unregister (which * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup. */ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; /* XDP flags for ndo_xdp_xmit */ #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ u32 id; }; struct page_pool; struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; struct xdp_mem_info mem; unsigned int napi_id; u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { struct net_device *dev; }; enum xdp_buff_flags { XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under * pressure */ }; struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_HAS_FRAGS; } static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) { xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { xdp->frame_sz = frame_sz; xdp->rxq = rxq; xdp->flags = 0; } static __always_inline void xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, int headroom, int data_len, const bool meta_valid) { unsigned char *data = hard_start + headroom; xdp->data_hard_start = hard_start; xdp->data = data; xdp->data_end = data + data_len; xdp->data_meta = meta_valid ? data : data + 1; } /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the * XDP/BPF data access to data_hard_end. Notice same area (and size) * is used for XDP_PASS, when constructing the SKB via build_skb(). */ #define xdp_data_hard_end(xdp) \ ((xdp)->data_hard_start + (xdp)->frame_sz - \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * xdp_get_shared_info_from_buff(struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) { unsigned int len = xdp->data_end - xdp->data; struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); len += sinfo->xdp_frags_size; out: return len; } struct xdp_frame { void *data; u16 len; u16 headroom; u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem info is valid on remote CPU. */ struct xdp_mem_info mem; struct net_device *dev_rx; /* used by cpumap */ u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; void *xa; void *q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) { /* bq->count will be zero'ed when bq->xa gets updated */ bq->xa = NULL; } static inline struct skb_shared_info * xdp_get_shared_info_from_frame(struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); return (struct skb_shared_info *)(data_hard_start + frame->frame_sz - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); } struct xdp_cpumap_stats { unsigned int redirect; unsigned int pass; unsigned int drop; }; /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { frame->data = NULL; frame->dev_rx = NULL; } static inline void xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, unsigned int size, unsigned int truesize, bool pfmemalloc) { skb_shinfo(skb)->nr_frags = nr_frags; skb->len += size; skb->data_len += size; skb->truesize += truesize; skb->pfmemalloc |= pfmemalloc; } /* Avoids inlining WARN macro in fast-path */ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; xdp->data_end = frame->data + frame->len; xdp->data_meta = frame->data - frame->metasize; xdp->frame_sz = frame->frame_sz; xdp->flags = frame->flags; } static inline int xdp_update_frame_from_buff(struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; /* Assure headroom is available for storing info */ headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; metasize = metasize > 0 ? metasize : 0; if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return -ENOSPC; /* Catch if driver didn't reserve tailroom for skb_shared_info */ if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { XDP_WARN("Driver BUG: missing reserved tailroom"); return -ENOSPC; } xdp_frame->data = xdp->data; xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; xdp_frame->frame_sz = xdp->frame_sz; xdp_frame->flags = xdp->flags; return 0; } /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) { struct xdp_frame *xdp_frame; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ xdp_frame->mem = xdp->rxq->mem; return xdp_frame; } void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; unsigned int len = xdpf->len; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); len += sinfo->xdp_frags_size; out: return len; } int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id, u32 frag_size); static inline int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id) { return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0); } void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator); void xdp_unreg_mem_model(struct xdp_mem_info *mem); /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. */ static __always_inline void xdp_set_data_meta_invalid(struct xdp_buff *xdp) { xdp->data_meta = xdp->data + 1; } static __always_inline bool xdp_data_meta_unsupported(const struct xdp_buff *xdp) { return unlikely(xdp->data_meta > xdp->data); } static inline bool xdp_metalen_invalid(unsigned long metalen) { return (metalen & (sizeof(__u32) - 1)) || (metalen > 32); } struct xdp_attachment_info { struct bpf_prog *prog; u32 flags; }; struct netdev_bpf; void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf); #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE /* Define the relationship between xdp-rx-metadata kfunc and * various other entities: * - xdp_rx_metadata enum * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml) * - kfunc name * - xdp_metadata_ops field */ #define XDP_METADATA_KFUNC_xxx \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ NETDEV_XDP_RX_METADATA_TIMESTAMP, \ bpf_xdp_metadata_rx_timestamp, \ xmo_rx_timestamp) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ enum xdp_rx_metadata { #define XDP_METADATA_KFUNC(name, _, __, ___) name, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC MAX_XDP_METADATA_KFUNC, }; enum xdp_rss_hash_type { /* First part: Individual bits for L3/L4 types */ XDP_RSS_L3_IPV4 = BIT(0), XDP_RSS_L3_IPV6 = BIT(1), /* The fixed (L3) IPv4 and IPv6 headers can both be followed by * variable/dynamic headers, IPv4 called Options and IPv6 called * Extension Headers. HW RSS type can contain this info. */ XDP_RSS_L3_DYNHDR = BIT(2), /* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in * addition to the protocol specific bit. This ease interaction with * SKBs and avoids reserving a fixed mask for future L4 protocol bits. */ XDP_RSS_L4 = BIT(3), /* L4 based hash, proto can be unknown */ XDP_RSS_L4_TCP = BIT(4), XDP_RSS_L4_UDP = BIT(5), XDP_RSS_L4_SCTP = BIT(6), XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ /* Second part: RSS hash type combinations used for driver HW mapping */ XDP_RSS_TYPE_NONE = 0, XDP_RSS_TYPE_L2 = XDP_RSS_TYPE_NONE, XDP_RSS_TYPE_L3_IPV4 = XDP_RSS_L3_IPV4, XDP_RSS_TYPE_L3_IPV6 = XDP_RSS_L3_IPV6, XDP_RSS_TYPE_L3_IPV4_OPT = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L3_IPV6_EX = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_ANY = XDP_RSS_L4, XDP_RSS_TYPE_L4_IPV4_TCP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR, }; struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type); }; #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } static inline void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) { } static inline void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) { } static inline void xdp_features_clear_redirect_target(struct net_device *dev) { } #endif static inline void xdp_clear_features_flag(struct net_device *dev) { xdp_set_features_flag(dev, 0); } static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus * under local_bh_disable(), which provides the needed RCU protection * for accessing map entries. */ u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) act = xdp_master_redirect(xdp); } return act; } #endif /* __LINUX_NET_XDP_H__ */
2 6 3 4 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for the UDP-Lite (RFC 3828) code. */ #ifndef _UDPLITE_H #define _UDPLITE_H #include <net/ip6_checksum.h> #include <net/udp.h> /* UDP-Lite socket options */ #define UDPLITE_SEND_CSCOV 10 /* sender partial coverage (as sent) */ #define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */ extern struct proto udplite_prot; extern struct udp_table udplite_table; /* * Checksum computation is all in software, hence simpler getfrag. */ static __inline__ int udplite_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct msghdr *msg = from; return copy_from_iter_full(to, len, &msg->msg_iter) ? 0 : -EFAULT; } /* * Checksumming routines */ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) { u16 cscov; /* In UDPv4 a zero checksum means that the transmitter generated no * checksum. UDP-Lite (like IPv6) mandates checksums, hence packets * with a zero checksum field are illegal. */ if (uh->check == 0) { net_dbg_ratelimited("UDPLite: zeroed checksum field\n"); return 1; } cscov = ntohs(uh->len); if (cscov == 0) /* Indicates that full coverage is required. */ ; else if (cscov < 8 || cscov > skb->len) { /* * Coverage length violates RFC 3828: log and discard silently. */ net_dbg_ratelimited("UDPLite: bad csum coverage %d/%d\n", cscov, skb->len); return 1; } else if (cscov < skb->len) { UDP_SKB_CB(skb)->partial_cov = 1; UDP_SKB_CB(skb)->cscov = cscov; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; skb->csum_valid = 0; } return 0; } /* Fast-path computation of checksum. Socket may not be locked. */ static inline __wsum udplite_csum(struct sk_buff *skb) { const int off = skb_transport_offset(skb); const struct sock *sk = skb->sk; int len = skb->len - off; if (udp_test_bit(UDPLITE_SEND_CC, sk)) { u16 pcslen = READ_ONCE(udp_sk(sk)->pcslen); if (pcslen < len) { if (pcslen > 0) len = pcslen; udp_hdr(skb)->len = htons(pcslen); } } skb->ip_summed = CHECKSUM_NONE; /* no HW support for checksumming */ return skb_checksum(skb, off, len, 0); } void udplite4_register(void); #endif /* _UDPLITE_H */
19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 3 16 16 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 // SPDX-License-Identifier: GPL-2.0 /* * xfrm_input.c * * Changes: * YOSHIFUJI Hideaki @USAGI * Split up af-specific portion * */ #include <linux/bottom_half.h> #include <linux/cache.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/percpu.h> #include <net/dst.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h> #include <net/dst_metadata.h> #include "xfrm_inout.h" struct xfrm_trans_tasklet { struct work_struct work; spinlock_t queue_lock; struct sk_buff_head queue; }; struct xfrm_trans_cb { union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb); struct net *net; }; #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[2][AF_INET6 + 1]; static struct gro_cells gro_cells; static struct net_device xfrm_napi_dev; static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet); int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; if (WARN_ON(afinfo->family > AF_INET6)) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_input_afinfo_lock); if (unlikely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family])) err = -EEXIST; else rcu_assign_pointer(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family], afinfo); spin_unlock_bh(&xfrm_input_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_input_register_afinfo); int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; spin_lock_bh(&xfrm_input_afinfo_lock); if (likely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family])) { if (unlikely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family] != afinfo)) err = -EINVAL; else RCU_INIT_POINTER(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family], NULL); } spin_unlock_bh(&xfrm_input_afinfo_lock); synchronize_rcu(); return err; } EXPORT_SYMBOL(xfrm_input_unregister_afinfo); static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(u8 family, bool is_ipip) { const struct xfrm_input_afinfo *afinfo; if (WARN_ON_ONCE(family > AF_INET6)) return NULL; rcu_read_lock(); afinfo = rcu_dereference(xfrm_input_afinfo[is_ipip][family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; } static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol, int err) { bool is_ipip = (protocol == IPPROTO_IPIP || protocol == IPPROTO_IPV6); const struct xfrm_input_afinfo *afinfo; int ret; afinfo = xfrm_input_get_afinfo(family, is_ipip); if (!afinfo) return -EAFNOSUPPORT; ret = afinfo->callback(skb, protocol, err); rcu_read_unlock(); return ret; } struct sec_path *secpath_set(struct sk_buff *skb) { struct sec_path *sp, *tmp = skb_ext_find(skb, SKB_EXT_SEC_PATH); sp = skb_ext_add(skb, SKB_EXT_SEC_PATH); if (!sp) return NULL; if (tmp) /* reused existing one (was COW'd if needed) */ return sp; /* allocated new secpath */ memset(sp->ovec, 0, sizeof(sp->ovec)); sp->olen = 0; sp->len = 0; sp->verified_cnt = 0; return sp; } EXPORT_SYMBOL(secpath_set); /* Fetch spi and seq from ipsec header */ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) { int offset, offset_seq; int hlen; switch (nexthdr) { case IPPROTO_AH: hlen = sizeof(struct ip_auth_hdr); offset = offsetof(struct ip_auth_hdr, spi); offset_seq = offsetof(struct ip_auth_hdr, seq_no); break; case IPPROTO_ESP: hlen = sizeof(struct ip_esp_hdr); offset = offsetof(struct ip_esp_hdr, spi); offset_seq = offsetof(struct ip_esp_hdr, seq_no); break; case IPPROTO_COMP: if (!pskb_may_pull(skb, sizeof(struct ip_comp_hdr))) return -EINVAL; *spi = htonl(ntohs(*(__be16 *)(skb_transport_header(skb) + 2))); *seq = 0; return 0; default: return 1; } if (!pskb_may_pull(skb, hlen)) return -EINVAL; *spi = *(__be32 *)(skb_transport_header(skb) + offset); *seq = *(__be32 *)(skb_transport_header(skb) + offset_seq); return 0; } EXPORT_SYMBOL(xfrm_parse_spi); static int xfrm4_remove_beet_encap(struct xfrm_state *x, struct sk_buff *skb) { struct iphdr *iph; int optlen = 0; int err = -EINVAL; skb->protocol = htons(ETH_P_IP); if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) { struct ip_beet_phdr *ph; int phlen; if (!pskb_may_pull(skb, sizeof(*ph))) goto out; ph = (struct ip_beet_phdr *)skb->data; phlen = sizeof(*ph) + ph->padlen; optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen); if (optlen < 0 || optlen & 3 || optlen > 250) goto out; XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr; if (!pskb_may_pull(skb, phlen)) goto out; __skb_pull(skb, phlen); } skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); xfrm4_beet_make_header(skb); iph = ip_hdr(skb); iph->ihl += optlen / 4; iph->tot_len = htons(skb->len); iph->daddr = x->sel.daddr.a4; iph->saddr = x->sel.saddr.a4; iph->check = 0; iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); err = 0; out: return err; } static void ipip_ecn_decapsulate(struct sk_buff *skb) { struct iphdr *inner_iph = ipip_hdr(skb); if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) IP_ECN_set_ce(inner_iph); } static int xfrm4_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; skb->protocol = htons(ETH_P_IP); if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out; err = skb_unclone(skb, GFP_ATOMIC); if (err) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip_ecn_decapsulate(skb); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); if (skb->mac_len) eth_hdr(skb)->h_proto = skb->protocol; err = 0; out: return err; } static void ipip6_ecn_decapsulate(struct sk_buff *skb) { struct ipv6hdr *inner_iph = ipipv6_hdr(skb); if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) IP6_ECN_set_ce(skb, inner_iph); } static int xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; skb->protocol = htons(ETH_P_IPV6); if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto out; err = skb_unclone(skb, GFP_ATOMIC); if (err) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipipv6_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); if (skb->mac_len) eth_hdr(skb)->h_proto = skb->protocol; err = 0; out: return err; } static int xfrm6_remove_beet_encap(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *ip6h; int size = sizeof(struct ipv6hdr); int err; skb->protocol = htons(ETH_P_IPV6); err = skb_cow_head(skb, size + skb->mac_len); if (err) goto out; __skb_push(skb, size); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); xfrm6_beet_make_header(skb); ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(skb->len - size); ip6h->daddr = x->sel.daddr.in6; ip6h->saddr = x->sel.saddr.in6; err = 0; out: return err; } /* Remove encapsulation header. * * The IP header will be moved over the top of the encapsulation * header. * * On entry, the transport header shall point to where the IP header * should be and the network header shall be set to where the IP * header currently is. skb->data shall point to the start of the * payload. */ static int xfrm_inner_mode_encap_remove(struct xfrm_state *x, struct sk_buff *skb) { switch (x->props.mode) { case XFRM_MODE_BEET: switch (x->sel.family) { case AF_INET: return xfrm4_remove_beet_encap(x, skb); case AF_INET6: return xfrm6_remove_beet_encap(x, skb); } break; case XFRM_MODE_TUNNEL: switch (XFRM_MODE_SKB_CB(skb)->protocol) { case IPPROTO_IPIP: return xfrm4_remove_tunnel_encap(x, skb); case IPPROTO_IPV6: return xfrm6_remove_tunnel_encap(x, skb); break; } return -EINVAL; } WARN_ON_ONCE(1); return -EOPNOTSUPP; } static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) { switch (x->props.family) { case AF_INET: xfrm4_extract_header(skb); break; case AF_INET6: xfrm6_extract_header(skb); break; default: WARN_ON_ONCE(1); return -EAFNOSUPPORT; } return xfrm_inner_mode_encap_remove(x, skb); } /* Remove encapsulation header. * * The IP header will be moved over the top of the encapsulation header. * * On entry, skb_transport_header() shall point to where the IP header * should be and skb_network_header() shall be set to where the IP header * currently is. skb->data shall point to the start of the payload. */ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { int ihl = skb->data - skb_transport_header(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), skb_network_header(skb), ihl); skb->network_header = skb->transport_header; } ip_hdr(skb)->tot_len = htons(skb->len + ihl); skb_reset_transport_header(skb); return 0; } static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) int ihl = skb->data - skb_transport_header(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), skb_network_header(skb), ihl); skb->network_header = skb->transport_header; } ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - sizeof(struct ipv6hdr)); skb_reset_transport_header(skb); return 0; #else WARN_ON_ONCE(1); return -EAFNOSUPPORT; #endif } static int xfrm_inner_mode_input(struct xfrm_state *x, struct sk_buff *skb) { switch (x->props.mode) { case XFRM_MODE_BEET: case XFRM_MODE_TUNNEL: return xfrm_prepare_input(x, skb); case XFRM_MODE_TRANSPORT: if (x->props.family == AF_INET) return xfrm4_transport_input(x, skb); if (x->props.family == AF_INET6) return xfrm6_transport_input(x, skb); break; case XFRM_MODE_ROUTEOPTIMIZATION: WARN_ON_ONCE(1); break; default: WARN_ON_ONCE(1); break; } return -EOPNOTSUPP; } int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { const struct xfrm_state_afinfo *afinfo; struct net *net = dev_net(skb->dev); int err; __be32 seq; __be32 seq_hi; struct xfrm_state *x = NULL; xfrm_address_t *daddr; u32 mark = skb->mark; unsigned int family = AF_UNSPEC; int decaps = 0; int async = 0; bool xfrm_gro = false; bool crypto_done = false; struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; if (encap_type < 0 || (xo && xo->flags & XFRM_GRO)) { x = xfrm_input_state(skb); if (unlikely(x->km.state != XFRM_STATE_VALID)) { if (x->km.state == XFRM_STATE_ACQ) XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); else XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); if (encap_type == -1) dev_put(skb->dev); goto drop; } family = x->props.family; /* An encap_type of -1 indicates async resumption. */ if (encap_type == -1) { async = 1; seq = XFRM_SKB_CB(skb)->seq.input.low; goto resume; } /* GRO call */ seq = XFRM_SPI_SKB_CB(skb)->seq; if (xo && (xo->flags & CRYPTO_DONE)) { crypto_done = true; family = XFRM_SPI_SKB_CB(skb)->family; if (!(xo->status & CRYPTO_SUCCESS)) { if (xo->status & (CRYPTO_TRANSPORT_AH_AUTH_FAILED | CRYPTO_TRANSPORT_ESP_AUTH_FAILED | CRYPTO_TUNNEL_AH_AUTH_FAILED | CRYPTO_TUNNEL_ESP_AUTH_FAILED)) { xfrm_audit_state_icvfail(x, skb, x->type->proto); x->stats.integrity_failed++; XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); goto drop; } if (xo->status & CRYPTO_INVALID_PROTOCOL) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); goto drop; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } if (xfrm_parse_spi(skb, nexthdr, &spi, &seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } } goto lock; } family = XFRM_SPI_SKB_CB(skb)->family; /* if tunnel is present override skb->mark value with tunnel i_key */ switch (family) { case AF_INET: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key); break; case AF_INET6: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key); break; } sp = secpath_set(skb); if (!sp) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } seq = 0; if (!spi && xfrm_parse_spi(skb, nexthdr, &spi, &seq)) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } daddr = (xfrm_address_t *)(skb_network_header(skb) + XFRM_SPI_SKB_CB(skb)->daddroff); do { sp = skb_sec_path(skb); if (sp->len == XFRM_MAX_DEPTH) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family); if (x == NULL) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); xfrm_audit_state_notfound(skb, family, spi, seq); goto drop; } skb->mark = xfrm_smark_get(skb->mark, x); sp->xvec[sp->len++] = x; skb_dst_force(skb); if (!skb_dst(skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } lock: spin_lock(&x->lock); if (unlikely(x->km.state != XFRM_STATE_VALID)) { if (x->km.state == XFRM_STATE_ACQ) XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); else XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); goto drop_unlock; } if ((x->encap ? x->encap->encap_type : 0) != encap_type) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); goto drop_unlock; } if (xfrm_replay_check(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } if (xfrm_state_check_expire(x)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEEXPIRED); goto drop_unlock; } spin_unlock(&x->lock); if (xfrm_tunnel_check(skb, x, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); goto drop; } seq_hi = htonl(xfrm_replay_seqhi(x, seq)); XFRM_SKB_CB(skb)->seq.input.low = seq; XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; dev_hold(skb->dev); if (crypto_done) nexthdr = x->type_offload->input_tail(x, skb); else nexthdr = x->type->input(x, skb); if (nexthdr == -EINPROGRESS) return 0; resume: dev_put(skb->dev); spin_lock(&x->lock); if (nexthdr < 0) { if (nexthdr == -EBADMSG) { xfrm_audit_state_icvfail(x, skb, x->type->proto); x->stats.integrity_failed++; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); goto drop_unlock; } /* only the first xfrm gets the encap type */ encap_type = 0; if (xfrm_replay_recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } xfrm_replay_advance(x, seq); x->curlft.bytes += skb->len; x->curlft.packets++; x->lastused = ktime_get_real_seconds(); spin_unlock(&x->lock); XFRM_MODE_SKB_CB(skb)->protocol = nexthdr; if (xfrm_inner_mode_input(x, skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); goto drop; } if (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) { decaps = 1; break; } /* * We need the inner address. However, we only get here for * transport mode so the outer address is identical. */ daddr = &x->id.daddr; family = x->props.family; err = xfrm_parse_spi(skb, nexthdr, &spi, &seq); if (err < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } crypto_done = false; } while (!err); err = xfrm_rcv_cb(skb, family, x->type->proto, 0); if (err) goto drop; nf_reset_ct(skb); if (decaps) { sp = skb_sec_path(skb); if (sp) sp->olen = 0; if (skb_valid_dst(skb)) skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return 0; } else { xo = xfrm_offload(skb); if (xo) xfrm_gro = xo->flags & XFRM_GRO; err = -EAFNOSUPPORT; rcu_read_lock(); afinfo = xfrm_state_afinfo_get_rcu(x->props.family); if (likely(afinfo)) err = afinfo->transport_finish(skb, xfrm_gro || async); rcu_read_unlock(); if (xfrm_gro) { sp = skb_sec_path(skb); if (sp) sp->olen = 0; if (skb_valid_dst(skb)) skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return err; } return err; } drop_unlock: spin_unlock(&x->lock); drop: xfrm_rcv_cb(skb, family, x && x->type ? x->type->proto : nexthdr, -1); kfree_skb(skb); return 0; } EXPORT_SYMBOL(xfrm_input); int xfrm_input_resume(struct sk_buff *skb, int nexthdr) { return xfrm_input(skb, nexthdr, 0, -1); } EXPORT_SYMBOL(xfrm_input_resume); static void xfrm_trans_reinject(struct work_struct *work) { struct xfrm_trans_tasklet *trans = container_of(work, struct xfrm_trans_tasklet, work); struct sk_buff_head queue; struct sk_buff *skb; __skb_queue_head_init(&queue); spin_lock_bh(&trans->queue_lock); skb_queue_splice_init(&trans->queue, &queue); spin_unlock_bh(&trans->queue_lock); local_bh_disable(); while ((skb = __skb_dequeue(&queue))) XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net, NULL, skb); local_bh_enable(); } int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)) { struct xfrm_trans_tasklet *trans; trans = this_cpu_ptr(&xfrm_trans_tasklet); if (skb_queue_len(&trans->queue) >= READ_ONCE(netdev_max_backlog)) return -ENOBUFS; BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); XFRM_TRANS_SKB_CB(skb)->finish = finish; XFRM_TRANS_SKB_CB(skb)->net = net; spin_lock_bh(&trans->queue_lock); __skb_queue_tail(&trans->queue, skb); spin_unlock_bh(&trans->queue_lock); schedule_work(&trans->work); return 0; } EXPORT_SYMBOL(xfrm_trans_queue_net); int xfrm_trans_queue(struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)) { return xfrm_trans_queue_net(dev_net(skb->dev), skb, finish); } EXPORT_SYMBOL(xfrm_trans_queue); void __init xfrm_input_init(void) { int err; int i; init_dummy_netdev(&xfrm_napi_dev); err = gro_cells_init(&gro_cells, &xfrm_napi_dev); if (err) gro_cells.cells = NULL; for_each_possible_cpu(i) { struct xfrm_trans_tasklet *trans; trans = &per_cpu(xfrm_trans_tasklet, i); spin_lock_init(&trans->queue_lock); __skb_queue_head_init(&trans->queue); INIT_WORK(&trans->work, xfrm_trans_reinject); } }
1468 1 22 22 2 2 2 1 1 1 1 1 1 1 1 2 2 2 2 2 1 1 1 1 4 3 3 2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* gw.c - CAN frame Gateway/Router/Bridge with netlink interface * * Copyright (c) 2019 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/gw.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #define CAN_GW_NAME "can-gw" MODULE_DESCRIPTION("PF_CAN netlink gateway"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS(CAN_GW_NAME); #define CGW_MIN_HOPS 1 #define CGW_MAX_HOPS 6 #define CGW_DEFAULT_HOPS 1 static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS; module_param(max_hops, uint, 0444); MODULE_PARM_DESC(max_hops, "maximum " CAN_GW_NAME " routing hops for CAN frames " "(valid values: " __stringify(CGW_MIN_HOPS) "-" __stringify(CGW_MAX_HOPS) " hops, " "default: " __stringify(CGW_DEFAULT_HOPS) ")"); static struct notifier_block notifier; static struct kmem_cache *cgw_cache __read_mostly; /* structure that contains the (on-the-fly) CAN frame modifications */ struct cf_mod { struct { struct canfd_frame and; struct canfd_frame or; struct canfd_frame xor; struct canfd_frame set; } modframe; struct { u8 and; u8 or; u8 xor; u8 set; } modtype; void (*modfunc[MAX_MODFUNCTIONS])(struct canfd_frame *cf, struct cf_mod *mod); /* CAN frame checksum calculation after CAN frame modifications */ struct { struct cgw_csum_xor xor; struct cgw_csum_crc8 crc8; } csum; struct { void (*xor)(struct canfd_frame *cf, struct cgw_csum_xor *xor); void (*crc8)(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8); } csumfunc; u32 uid; }; /* So far we just support CAN -> CAN routing and frame modifications. * * The internal can_can_gw structure contains data and attributes for * a CAN -> CAN gateway job. */ struct can_can_gw { struct can_filter filter; int src_idx; int dst_idx; }; /* list entry for CAN gateways jobs */ struct cgw_job { struct hlist_node list; struct rcu_head rcu; u32 handled_frames; u32 dropped_frames; u32 deleted_frames; struct cf_mod mod; union { /* CAN frame data source */ struct net_device *dev; } src; union { /* CAN frame data destination */ struct net_device *dev; } dst; union { struct can_can_gw ccgw; /* tbc */ }; u8 gwtype; u8 limit_hops; u16 flags; }; /* modification functions that are invoked in the hot path in can_can_gw_rcv */ #define MODFUNC(func, op) static void func(struct canfd_frame *cf, \ struct cf_mod *mod) { op ; } MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id) MODFUNC(mod_and_len, cf->len &= mod->modframe.and.len) MODFUNC(mod_and_flags, cf->flags &= mod->modframe.and.flags) MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data) MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id) MODFUNC(mod_or_len, cf->len |= mod->modframe.or.len) MODFUNC(mod_or_flags, cf->flags |= mod->modframe.or.flags) MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data) MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id) MODFUNC(mod_xor_len, cf->len ^= mod->modframe.xor.len) MODFUNC(mod_xor_flags, cf->flags ^= mod->modframe.xor.flags) MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data) MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id) MODFUNC(mod_set_len, cf->len = mod->modframe.set.len) MODFUNC(mod_set_flags, cf->flags = mod->modframe.set.flags) MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data) static void mod_and_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) &= *(u64 *)(mod->modframe.and.data + i); } static void mod_or_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) |= *(u64 *)(mod->modframe.or.data + i); } static void mod_xor_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) ^= *(u64 *)(mod->modframe.xor.data + i); } static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod) { memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN); } /* retrieve valid CC DLC value and store it into 'len' */ static void mod_retrieve_ccdlc(struct canfd_frame *cf) { struct can_frame *ccf = (struct can_frame *)cf; /* len8_dlc is only valid if len == CAN_MAX_DLEN */ if (ccf->len != CAN_MAX_DLEN) return; /* do we have a valid len8_dlc value from 9 .. 15 ? */ if (ccf->len8_dlc > CAN_MAX_DLEN && ccf->len8_dlc <= CAN_MAX_RAW_DLC) ccf->len = ccf->len8_dlc; } /* convert valid CC DLC value in 'len' into struct can_frame elements */ static void mod_store_ccdlc(struct canfd_frame *cf) { struct can_frame *ccf = (struct can_frame *)cf; /* clear potential leftovers */ ccf->len8_dlc = 0; /* plain data length 0 .. 8 - that was easy */ if (ccf->len <= CAN_MAX_DLEN) return; /* potentially broken values are caught in can_can_gw_rcv() */ if (ccf->len > CAN_MAX_RAW_DLC) return; /* we have a valid dlc value from 9 .. 15 in ccf->len */ ccf->len8_dlc = ccf->len; ccf->len = CAN_MAX_DLEN; } static void mod_and_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_and_len(cf, mod); mod_store_ccdlc(cf); } static void mod_or_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_or_len(cf, mod); mod_store_ccdlc(cf); } static void mod_xor_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_xor_len(cf, mod); mod_store_ccdlc(cf); } static void mod_set_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_set_len(cf, mod); mod_store_ccdlc(cf); } static void canframecpy(struct canfd_frame *dst, struct can_frame *src) { /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 3 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; dst->len = src->len; *(u64 *)dst->data = *(u64 *)src->data; } static void canfdframecpy(struct canfd_frame *dst, struct canfd_frame *src) { /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 2 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; dst->flags = src->flags; dst->len = src->len; memcpy(dst->data, src->data, CANFD_MAX_DLEN); } static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re, struct rtcanmsg *r) { s8 dlen = CAN_MAX_DLEN; if (r->flags & CGW_FLAGS_CAN_FD) dlen = CANFD_MAX_DLEN; /* absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0] * relative to received dlc -1 .. -8 : * e.g. for received dlc = 8 * -1 => index = 7 (data[7]) * -3 => index = 5 (data[5]) * -8 => index = 0 (data[0]) */ if (fr >= -dlen && fr < dlen && to >= -dlen && to < dlen && re >= -dlen && re < dlen) return 0; else return -EINVAL; } static inline int calc_idx(int idx, int rx_len) { if (idx < 0) return rx_len + idx; else return idx; } static void cgw_csum_xor_rel(struct canfd_frame *cf, struct cgw_csum_xor *xor) { int from = calc_idx(xor->from_idx, cf->len); int to = calc_idx(xor->to_idx, cf->len); int res = calc_idx(xor->result_idx, cf->len); u8 val = xor->init_xor_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = from; i <= to; i++) val ^= cf->data[i]; } else { for (i = from; i >= to; i--) val ^= cf->data[i]; } cf->data[res] = val; } static void cgw_csum_xor_pos(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i <= xor->to_idx; i++) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_xor_neg(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i >= xor->to_idx; i--) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_crc8_rel(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { int from = calc_idx(crc8->from_idx, cf->len); int to = calc_idx(crc8->to_idx, cf->len); int res = calc_idx(crc8->result_idx, cf->len); u8 crc = crc8->init_crc_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = crc8->from_idx; i <= crc8->to_idx; i++) crc = crc8->crctab[crc ^ cf->data[i]]; } else { for (i = crc8->from_idx; i >= crc8->to_idx; i--) crc = crc8->crctab[crc ^ cf->data[i]]; } switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } static void cgw_csum_crc8_pos(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i <= crc8->to_idx; i++) crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } static void cgw_csum_crc8_neg(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i >= crc8->to_idx; i--) crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } /* the receive & process & send function */ static void can_can_gw_rcv(struct sk_buff *skb, void *data) { struct cgw_job *gwj = (struct cgw_job *)data; struct canfd_frame *cf; struct sk_buff *nskb; int modidx = 0; /* process strictly Classic CAN or CAN FD frames */ if (gwj->flags & CGW_FLAGS_CAN_FD) { if (!can_is_canfd_skb(skb)) return; } else { if (!can_is_can_skb(skb)) return; } /* Do not handle CAN frames routed more than 'max_hops' times. * In general we should never catch this delimiter which is intended * to cover a misconfiguration protection (e.g. circular CAN routes). * * The Controller Area Network controllers only accept CAN frames with * correct CRCs - which are not visible in the controller registers. * According to skbuff.h documentation the csum_start element for IP * checksums is undefined/unused when ip_summed == CHECKSUM_UNNECESSARY. * Only CAN skbs can be processed here which already have this property. */ #define cgw_hops(skb) ((skb)->csum_start) BUG_ON(skb->ip_summed != CHECKSUM_UNNECESSARY); if (cgw_hops(skb) >= max_hops) { /* indicate deleted frames due to misconfiguration */ gwj->deleted_frames++; return; } if (!(gwj->dst.dev->flags & IFF_UP)) { gwj->dropped_frames++; return; } /* is sending the skb back to the incoming interface not allowed? */ if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) && can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex) return; /* clone the given skb, which has not been done in can_rcv() * * When there is at least one modification function activated, * we need to copy the skb as we want to modify skb->data. */ if (gwj->mod.modfunc[0]) nskb = skb_copy(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) { gwj->dropped_frames++; return; } /* put the incremented hop counter in the cloned skb */ cgw_hops(nskb) = cgw_hops(skb) + 1; /* first processing of this CAN frame -> adjust to private hop limit */ if (gwj->limit_hops && cgw_hops(nskb) == 1) cgw_hops(nskb) = max_hops - gwj->limit_hops + 1; nskb->dev = gwj->dst.dev; /* pointer to modifiable CAN frame */ cf = (struct canfd_frame *)nskb->data; /* perform preprocessed modification functions if there are any */ while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx]) (*gwj->mod.modfunc[modidx++])(cf, &gwj->mod); /* Has the CAN frame been modified? */ if (modidx) { /* get available space for the processed CAN frame type */ int max_len = nskb->len - offsetof(struct canfd_frame, data); /* dlc may have changed, make sure it fits to the CAN frame */ if (cf->len > max_len) { /* delete frame due to misconfiguration */ gwj->deleted_frames++; kfree_skb(nskb); return; } /* check for checksum updates */ if (gwj->mod.csumfunc.crc8) (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); if (gwj->mod.csumfunc.xor) (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor); } /* clear the skb timestamp if not configured the other way */ if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP)) nskb->tstamp = 0; /* send to netdevice */ if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO)) gwj->dropped_frames++; else gwj->handled_frames++; } static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj) { return can_rx_register(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj, "gw", NULL); } static inline void cgw_unregister_filter(struct net *net, struct cgw_job *gwj) { can_rx_unregister(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj); } static void cgw_job_free_rcu(struct rcu_head *rcu_head) { struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu); kmem_cache_free(cgw_cache, gwj); } static int cgw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg == NETDEV_UNREGISTER) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); } } } return NOTIFY_DONE; } static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, u32 pid, u32 seq, int flags) { struct rtcanmsg *rtcan; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags); if (!nlh) return -EMSGSIZE; rtcan = nlmsg_data(nlh); rtcan->can_family = AF_CAN; rtcan->gwtype = gwj->gwtype; rtcan->flags = gwj->flags; /* add statistics if available */ if (gwj->handled_frames) { if (nla_put_u32(skb, CGW_HANDLED, gwj->handled_frames) < 0) goto cancel; } if (gwj->dropped_frames) { if (nla_put_u32(skb, CGW_DROPPED, gwj->dropped_frames) < 0) goto cancel; } if (gwj->deleted_frames) { if (nla_put_u32(skb, CGW_DELETED, gwj->deleted_frames) < 0) goto cancel; } /* check non default settings of attributes */ if (gwj->limit_hops) { if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0) goto cancel; } if (gwj->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; if (gwj->mod.modtype.and) { memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.and; if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.or) { memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.or; if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.xor) { memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.xor; if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.set) { memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.set; if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } else { struct cgw_frame_mod mb; if (gwj->mod.modtype.and) { memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.and; if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.or) { memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.or; if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.xor) { memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.xor; if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.set) { memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.set; if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } if (gwj->mod.uid) { if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0) goto cancel; } if (gwj->mod.csumfunc.crc8) { if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN, &gwj->mod.csum.crc8) < 0) goto cancel; } if (gwj->mod.csumfunc.xor) { if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN, &gwj->mod.csum.xor) < 0) goto cancel; } if (gwj->gwtype == CGW_TYPE_CAN_CAN) { if (gwj->ccgw.filter.can_id || gwj->ccgw.filter.can_mask) { if (nla_put(skb, CGW_FILTER, sizeof(struct can_filter), &gwj->ccgw.filter) < 0) goto cancel; } if (nla_put_u32(skb, CGW_SRC_IF, gwj->ccgw.src_idx) < 0) goto cancel; if (nla_put_u32(skb, CGW_DST_IF, gwj->ccgw.dst_idx) < 0) goto cancel; } nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } /* Dump information about all CAN gateway jobs, in response to RTM_GETROUTE */ static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; int idx = 0; int s_idx = cb->args[0]; rcu_read_lock(); hlist_for_each_entry_rcu(gwj, &net->can.cgw_list, list) { if (idx < s_idx) goto cont; if (cgw_put_job(skb, gwj, RTM_NEWROUTE, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) break; cont: idx++; } rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static const struct nla_policy cgw_policy[CGW_MAX + 1] = { [CGW_MOD_AND] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_OR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_XOR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_SET] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_CS_XOR] = { .len = sizeof(struct cgw_csum_xor) }, [CGW_CS_CRC8] = { .len = sizeof(struct cgw_csum_crc8) }, [CGW_SRC_IF] = { .type = NLA_U32 }, [CGW_DST_IF] = { .type = NLA_U32 }, [CGW_FILTER] = { .len = sizeof(struct can_filter) }, [CGW_LIM_HOPS] = { .type = NLA_U8 }, [CGW_MOD_UID] = { .type = NLA_U32 }, [CGW_FDMOD_AND] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_OR] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_XOR] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_SET] = { .len = sizeof(struct cgw_fdframe_mod) }, }; /* check for common and gwtype specific attributes */ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, u8 gwtype, void *gwtypeattr, u8 *limhops) { struct nlattr *tb[CGW_MAX + 1]; struct rtcanmsg *r = nlmsg_data(nlh); int modidx = 0; int err = 0; /* initialize modification & checksum data space */ memset(mod, 0, sizeof(*mod)); err = nlmsg_parse_deprecated(nlh, sizeof(struct rtcanmsg), tb, CGW_MAX, cgw_policy, NULL); if (err < 0) return err; if (tb[CGW_LIM_HOPS]) { *limhops = nla_get_u8(tb[CGW_LIM_HOPS]); if (*limhops < 1 || *limhops > max_hops) return -EINVAL; } /* check for AND/OR/XOR/SET modifications */ if (r->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; if (tb[CGW_FDMOD_AND]) { nla_memcpy(&mb, tb[CGW_FDMOD_AND], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.and, &mb.cf); mod->modtype.and = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_and_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_and_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_fddata; } if (tb[CGW_FDMOD_OR]) { nla_memcpy(&mb, tb[CGW_FDMOD_OR], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.or, &mb.cf); mod->modtype.or = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_or_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_or_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_fddata; } if (tb[CGW_FDMOD_XOR]) { nla_memcpy(&mb, tb[CGW_FDMOD_XOR], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.xor, &mb.cf); mod->modtype.xor = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_xor_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_xor_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_xor_fddata; } if (tb[CGW_FDMOD_SET]) { nla_memcpy(&mb, tb[CGW_FDMOD_SET], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.set, &mb.cf); mod->modtype.set = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_set_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_set_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_fddata; } } else { struct cgw_frame_mod mb; if (tb[CGW_MOD_AND]) { nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN); canframecpy(&mod->modframe.and, &mb.cf); mod->modtype.and = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_and_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_data; } if (tb[CGW_MOD_OR]) { nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.or, &mb.cf); mod->modtype.or = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_or_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_data; } if (tb[CGW_MOD_XOR]) { nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.xor, &mb.cf); mod->modtype.xor = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_xor_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_xor_data; } if (tb[CGW_MOD_SET]) { nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN); canframecpy(&mod->modframe.set, &mb.cf); mod->modtype.set = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_set_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_data; } } /* check for checksum operations after CAN frame modifications */ if (modidx) { if (tb[CGW_CS_CRC8]) { struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.crc8, tb[CGW_CS_CRC8], CGW_CS_CRC8_LEN); /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.crc8 = cgw_csum_crc8_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.crc8 = cgw_csum_crc8_pos; else mod->csumfunc.crc8 = cgw_csum_crc8_neg; } if (tb[CGW_CS_XOR]) { struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.xor, tb[CGW_CS_XOR], CGW_CS_XOR_LEN); /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.xor = cgw_csum_xor_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.xor = cgw_csum_xor_pos; else mod->csumfunc.xor = cgw_csum_xor_neg; } if (tb[CGW_MOD_UID]) nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32)); } if (gwtype == CGW_TYPE_CAN_CAN) { /* check CGW_TYPE_CAN_CAN specific attributes */ struct can_can_gw *ccgw = (struct can_can_gw *)gwtypeattr; memset(ccgw, 0, sizeof(*ccgw)); /* check for can_filter in attributes */ if (tb[CGW_FILTER]) nla_memcpy(&ccgw->filter, tb[CGW_FILTER], sizeof(struct can_filter)); err = -ENODEV; /* specifying two interfaces is mandatory */ if (!tb[CGW_SRC_IF] || !tb[CGW_DST_IF]) return err; ccgw->src_idx = nla_get_u32(tb[CGW_SRC_IF]); ccgw->dst_idx = nla_get_u32(tb[CGW_DST_IF]); /* both indices set to 0 for flushing all routing entries */ if (!ccgw->src_idx && !ccgw->dst_idx) return 0; /* only one index set to 0 is an error */ if (!ccgw->src_idx || !ccgw->dst_idx) return err; } /* add the checks for other gwtypes here */ return 0; } static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct rtcanmsg *r; struct cgw_job *gwj; struct cf_mod mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; if (mod.uid) { ASSERT_RTNL(); /* check for updating an existing job with identical uid */ hlist_for_each_entry(gwj, &net->can.cgw_list, list) { if (gwj->mod.uid != mod.uid) continue; /* interfaces & filters must be identical */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) return -EINVAL; /* update modifications with disabled softirq & quit */ local_bh_disable(); memcpy(&gwj->mod, &mod, sizeof(mod)); local_bh_enable(); return 0; } } /* ifindex == 0 is not allowed for job creation */ if (!ccgw.src_idx || !ccgw.dst_idx) return -ENODEV; gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL); if (!gwj) return -ENOMEM; gwj->handled_frames = 0; gwj->dropped_frames = 0; gwj->deleted_frames = 0; gwj->flags = r->flags; gwj->gwtype = r->gwtype; gwj->limit_hops = limhops; /* insert already parsed information */ memcpy(&gwj->mod, &mod, sizeof(mod)); memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw)); err = -ENODEV; gwj->src.dev = __dev_get_by_index(net, gwj->ccgw.src_idx); if (!gwj->src.dev) goto out; if (gwj->src.dev->type != ARPHRD_CAN) goto out; gwj->dst.dev = __dev_get_by_index(net, gwj->ccgw.dst_idx); if (!gwj->dst.dev) goto out; if (gwj->dst.dev->type != ARPHRD_CAN) goto out; /* is sending the skb back to the incoming interface intended? */ if (gwj->src.dev == gwj->dst.dev && !(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK)) { err = -EINVAL; goto out; } ASSERT_RTNL(); err = cgw_register_filter(net, gwj); if (!err) hlist_add_head_rcu(&gwj->list, &net->can.cgw_list); out: if (err) kmem_cache_free(cgw_cache, gwj); return err; } static void cgw_remove_all_jobs(struct net *net) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); } } static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; struct hlist_node *nx; struct rtcanmsg *r; struct cf_mod mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; /* two interface indices both set to 0 => remove all entries */ if (!ccgw.src_idx && !ccgw.dst_idx) { cgw_remove_all_jobs(net); return 0; } err = -EINVAL; ASSERT_RTNL(); /* remove only the first matching entry */ hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->flags != r->flags) continue; if (gwj->limit_hops != limhops) continue; /* we have a match when uid is enabled and identical */ if (gwj->mod.uid || mod.uid) { if (gwj->mod.uid != mod.uid) continue; } else { /* no uid => check for identical modifications */ if (memcmp(&gwj->mod, &mod, sizeof(mod))) continue; } /* if (r->gwtype == CGW_TYPE_CAN_CAN) - is made sure here */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) continue; hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); err = 0; break; } return err; } static int __net_init cangw_pernet_init(struct net *net) { INIT_HLIST_HEAD(&net->can.cgw_list); return 0; } static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) cgw_remove_all_jobs(net); rtnl_unlock(); } static struct pernet_operations cangw_pernet_ops = { .init = cangw_pernet_init, .exit_batch = cangw_pernet_exit_batch, }; static __init int cgw_module_init(void) { int ret; /* sanitize given module parameter */ max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS); pr_info("can: netlink gateway - max_hops=%d\n", max_hops); ret = register_pernet_subsys(&cangw_pernet_ops); if (ret) return ret; ret = -ENOMEM; cgw_cache = kmem_cache_create("can_gw", sizeof(struct cgw_job), 0, 0, NULL); if (!cgw_cache) goto out_cache_create; /* set notifier */ notifier.notifier_call = cgw_notifier; ret = register_netdevice_notifier(&notifier); if (ret) goto out_register_notifier; ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_GETROUTE, NULL, cgw_dump_jobs, 0); if (ret) goto out_rtnl_register1; ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_NEWROUTE, cgw_create_job, NULL, 0); if (ret) goto out_rtnl_register2; ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_DELROUTE, cgw_remove_job, NULL, 0); if (ret) goto out_rtnl_register3; return 0; out_rtnl_register3: rtnl_unregister(PF_CAN, RTM_NEWROUTE); out_rtnl_register2: rtnl_unregister(PF_CAN, RTM_GETROUTE); out_rtnl_register1: unregister_netdevice_notifier(&notifier); out_register_notifier: kmem_cache_destroy(cgw_cache); out_cache_create: unregister_pernet_subsys(&cangw_pernet_ops); return ret; } static __exit void cgw_module_exit(void) { rtnl_unregister_all(PF_CAN); unregister_netdevice_notifier(&notifier); unregister_pernet_subsys(&cangw_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(cgw_cache); } module_init(cgw_module_init); module_exit(cgw_module_exit);
1001 1001 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 // SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which manage high resolution tick * related events. * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> #include "tick-internal.h" /** * tick_program_event - program the CPU local timer device for the next event */ int tick_program_event(ktime_t expires, int force) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); if (unlikely(expires == KTIME_MAX)) { /* * We don't need the clock event device any more, stop it. */ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); dev->next_event = KTIME_MAX; return 0; } if (unlikely(clockevent_state_oneshot_stopped(dev))) { /* * We need the clock event again, configure it in ONESHOT mode * before using it. */ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); } return clockevents_program_event(dev, expires, force); } /** * tick_resume_oneshot - resume oneshot mode */ void tick_resume_oneshot(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(dev, ktime_get(), true); } /** * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) */ void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), ktime_t next_event) { newdev->event_handler = handler; clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(newdev, next_event, true); } /** * tick_switch_to_oneshot - switch to oneshot mode */ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || !tick_device_is_functional(dev)) { pr_info("Clockevents: could not switch to one-shot mode:"); if (!dev) { pr_cont(" no tick device\n"); } else { if (!tick_device_is_functional(dev)) pr_cont(" %s is not functional.\n", dev->name); else pr_cont(" %s does not support one-shot mode.\n", dev->name); } return -EINVAL; } td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_switch_to_oneshot(); return 0; } /** * tick_oneshot_mode_active - check whether the system is in oneshot mode * * returns 1 when either nohz or highres are enabled. otherwise 0. */ int tick_oneshot_mode_active(void) { unsigned long flags; int ret; local_irq_save(flags); ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; local_irq_restore(flags); return ret; } #ifdef CONFIG_HIGH_RES_TIMERS /** * tick_init_highres - switch to high resolution mode * * Called with interrupts disabled. */ int tick_init_highres(void) { return tick_switch_to_oneshot(hrtimer_interrupt); } #endif
43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich */ #ifndef _NET_BATMAN_ADV_BLA_H_ #define _NET_BATMAN_ADV_BLA_H_ #include "main.h" #include <linux/compiler.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> /** * batadv_bla_is_loopdetect_mac() - check if the mac address is from a loop * detect frame sent by bridge loop avoidance * @mac: mac address to check * * Return: true if the it looks like a loop detect frame * (mac starts with BA:BE), false otherwise */ static inline bool batadv_bla_is_loopdetect_mac(const uint8_t *mac) { if (mac[0] == 0xba && mac[1] == 0xbe) return true; return false; } #ifdef CONFIG_BATMAN_ADV_BLA bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int packet_type); bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid); bool batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size); int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb); bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid); bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb); void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct batadv_hard_iface *oldif); void batadv_bla_status_update(struct net_device *net_dev); int batadv_bla_init(struct batadv_priv *bat_priv); void batadv_bla_free(struct batadv_priv *bat_priv); #ifdef CONFIG_BATMAN_ADV_DAT bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); #endif #define BATADV_BLA_CRC_INIT 0 #else /* ifdef CONFIG_BATMAN_ADV_BLA */ static inline bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int packet_type) { return false; } static inline bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { return false; } static inline bool batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size) { return false; } static inline bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid) { return false; } static inline bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { return false; } static inline void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct batadv_hard_iface *oldif) { } static inline int batadv_bla_init(struct batadv_priv *bat_priv) { return 1; } static inline void batadv_bla_free(struct batadv_priv *bat_priv) { } static inline int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) { return -EOPNOTSUPP; } static inline int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) { return -EOPNOTSUPP; } static inline bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid) { return true; } #endif /* ifdef CONFIG_BATMAN_ADV_BLA */ #endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */
12002 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 /* * include/linux/topology.h * * Written by: Matthew Dobson, IBM Corporation * * Copyright (C) 2002, IBM Corp. * * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Send feedback to <colpatch@us.ibm.com> */ #ifndef _LINUX_TOPOLOGY_H #define _LINUX_TOPOLOGY_H #include <linux/arch_topology.h> #include <linux/cpumask.h> #include <linux/bitops.h> #include <linux/mmzone.h> #include <linux/smp.h> #include <linux/percpu.h> #include <asm/topology.h> #ifndef nr_cpus_node #define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) #endif #define for_each_node_with_cpus(node) \ for_each_online_node(node) \ if (nr_cpus_node(node)) int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 #define DISTANCE_BITS 8 #ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif #ifndef RECLAIM_DISTANCE /* * If the distance between nodes in a system is larger than RECLAIM_DISTANCE * (in whatever arch specific measurement units returned by node_distance()) * and node_reclaim_mode is enabled then the VM will only call node_reclaim() * on nodes within this distance. */ #define RECLAIM_DISTANCE 30 #endif /* * The following tunable allows platforms to override the default node * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are * sufficiently fast that the default value actually hurts * performance. * * AMD EPYC machines use this because even though the 2-hop distance * is 32 (3.2x slower than a local memory access) performance actually * *improves* if allowed to reclaim memory and load balance tasks * between NUMA nodes 2-hops apart. */ extern int __read_mostly node_reclaim_distance; #ifndef PENALTY_FOR_NODE_WITH_CPUS #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DECLARE_PER_CPU(int, numa_node); #ifndef numa_node_id /* Returns the number of the current Node. */ static inline int numa_node_id(void) { return raw_cpu_read(numa_node); } #endif #ifndef cpu_to_node static inline int cpu_to_node(int cpu) { return per_cpu(numa_node, cpu); } #endif #ifndef set_numa_node static inline void set_numa_node(int node) { this_cpu_write(numa_node, node); } #endif #ifndef set_cpu_numa_node static inline void set_cpu_numa_node(int cpu, int node) { per_cpu(numa_node, cpu) = node; } #endif #else /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */ /* Returns the number of the current Node. */ #ifndef numa_node_id static inline int numa_node_id(void) { return cpu_to_node(raw_smp_processor_id()); } #endif #endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */ #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). */ DECLARE_PER_CPU(int, _numa_mem_); #ifndef set_numa_mem static inline void set_numa_mem(int node) { this_cpu_write(_numa_mem_, node); } #endif #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return raw_cpu_read(_numa_mem_); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return per_cpu(_numa_mem_, cpu); } #endif #ifndef set_cpu_numa_mem static inline void set_cpu_numa_mem(int cpu, int node) { per_cpu(_numa_mem_, cpu) = node; } #endif #else /* !CONFIG_HAVE_MEMORYLESS_NODES */ #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return numa_node_id(); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return cpu_to_node(cpu); } #endif #endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */ #if defined(topology_die_id) && defined(topology_die_cpumask) #define TOPOLOGY_DIE_SYSFS #endif #if defined(topology_cluster_id) && defined(topology_cluster_cpumask) #define TOPOLOGY_CLUSTER_SYSFS #endif #if defined(topology_book_id) && defined(topology_book_cpumask) #define TOPOLOGY_BOOK_SYSFS #endif #if defined(topology_drawer_id) && defined(topology_drawer_cpumask) #define TOPOLOGY_DRAWER_SYSFS #endif #ifndef topology_physical_package_id #define topology_physical_package_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_die_id #define topology_die_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_cluster_id #define topology_cluster_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif #ifndef topology_book_id #define topology_book_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_drawer_id #define topology_drawer_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_ppin #define topology_ppin(cpu) ((void)(cpu), 0ull) #endif #ifndef topology_sibling_cpumask #define topology_sibling_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_cluster_cpumask #define topology_cluster_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_die_cpumask #define topology_die_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_book_cpumask #define topology_book_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_drawer_cpumask #define topology_drawer_cpumask(cpu) cpumask_of(cpu) #endif #if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask) static inline const struct cpumask *cpu_smt_mask(int cpu) { return topology_sibling_cpumask(cpu); } #endif static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } #ifdef CONFIG_NUMA int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node); extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops); #else static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { return cpumask_nth_and(cpu, cpus, cpu_online_mask); } static inline const struct cpumask * sched_numa_hop_mask(unsigned int node, unsigned int hops) { return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_NUMA */ /** * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance * from a given node. * @mask: the iteration variable. * @node: the NUMA node to start the search from. * * Requires rcu_lock to be held. * * Yields cpu_online_mask for @node == NUMA_NO_NODE. */ #define for_each_numa_hop_mask(mask, node) \ for (unsigned int __hops = 0; \ mask = (node != NUMA_NO_NODE || __hops) ? \ sched_numa_hop_mask(node, __hops) : \ cpu_online_mask, \ !IS_ERR_OR_NULL(mask); \ __hops++) #endif /* _LINUX_TOPOLOGY_H */
212 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2019, Tessares SA. */ #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/net_namespace.h> #include <net/netns/generic.h> #include "protocol.h" #define MPTCP_SYSCTL_PATH "net/mptcp" static int mptcp_pernet_id; #ifdef CONFIG_SYSCTL static int mptcp_pm_type_max = __MPTCP_PM_TYPE_MAX; #endif struct mptcp_pernet { #ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_table_hdr; #endif unsigned int add_addr_timeout; unsigned int close_timeout; unsigned int stale_loss_cnt; u8 mptcp_enabled; u8 checksum_enabled; u8 allow_join_initial_addr_port; u8 pm_type; char scheduler[MPTCP_SCHED_NAME_MAX]; }; static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) { return net_generic(net, mptcp_pernet_id); } int mptcp_is_enabled(const struct net *net) { return mptcp_get_pernet(net)->mptcp_enabled; } unsigned int mptcp_get_add_addr_timeout(const struct net *net) { return mptcp_get_pernet(net)->add_addr_timeout; } int mptcp_is_checksum_enabled(const struct net *net) { return mptcp_get_pernet(net)->checksum_enabled; } int mptcp_allow_join_id0(const struct net *net) { return mptcp_get_pernet(net)->allow_join_initial_addr_port; } unsigned int mptcp_stale_loss_cnt(const struct net *net) { return mptcp_get_pernet(net)->stale_loss_cnt; } unsigned int mptcp_close_timeout(const struct sock *sk) { if (sock_flag(sk, SOCK_DEAD)) return TCP_TIMEWAIT_LEN; return mptcp_get_pernet(sock_net(sk))->close_timeout; } int mptcp_get_pm_type(const struct net *net) { return mptcp_get_pernet(net)->pm_type; } const char *mptcp_get_scheduler(const struct net *net) { return mptcp_get_pernet(net)->scheduler; } static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; pernet->add_addr_timeout = TCP_RTO_MAX; pernet->close_timeout = TCP_TIMEWAIT_LEN; pernet->checksum_enabled = 0; pernet->allow_join_initial_addr_port = 1; pernet->stale_loss_cnt = 4; pernet->pm_type = MPTCP_PM_TYPE_KERNEL; strcpy(pernet->scheduler, "default"); } #ifdef CONFIG_SYSCTL static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", .maxlen = sizeof(u8), .mode = 0644, /* users with CAP_NET_ADMIN or root (not and) can change this * value, same as other sysctl or the 'net' tree. */ .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "add_addr_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "checksum_enabled", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "allow_join_initial_addr_port", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "stale_loss_cnt", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, }, { .procname = "pm_type", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &mptcp_pm_type_max }, { .procname = "scheduler", .maxlen = MPTCP_SCHED_NAME_MAX, .mode = 0644, .proc_handler = proc_dostring, }, { .procname = "close_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, {} }; static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) { struct ctl_table_header *hdr; struct ctl_table *table; table = mptcp_sysctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL); if (!table) goto err_alloc; } table[0].data = &pernet->mptcp_enabled; table[1].data = &pernet->add_addr_timeout; table[2].data = &pernet->checksum_enabled; table[3].data = &pernet->allow_join_initial_addr_port; table[4].data = &pernet->stale_loss_cnt; table[5].data = &pernet->pm_type; table[6].data = &pernet->scheduler; table[7].data = &pernet->close_timeout; hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table, ARRAY_SIZE(mptcp_sysctl_table)); if (!hdr) goto err_reg; pernet->ctl_table_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) { struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; unregister_net_sysctl_table(pernet->ctl_table_hdr); kfree(table); } #else static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) { return 0; } static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {} #endif /* CONFIG_SYSCTL */ static int __net_init mptcp_net_init(struct net *net) { struct mptcp_pernet *pernet = mptcp_get_pernet(net); mptcp_pernet_set_defaults(pernet); return mptcp_pernet_new_table(net, pernet); } /* Note: the callback will only be called per extra netns */ static void __net_exit mptcp_net_exit(struct net *net) { struct mptcp_pernet *pernet = mptcp_get_pernet(net); mptcp_pernet_del_table(pernet); } static struct pernet_operations mptcp_pernet_ops = { .init = mptcp_net_init, .exit = mptcp_net_exit, .id = &mptcp_pernet_id, .size = sizeof(struct mptcp_pernet), }; void __init mptcp_init(void) { mptcp_join_cookie_init(); mptcp_proto_init(); if (register_pernet_subsys(&mptcp_pernet_ops) < 0) panic("Failed to register MPTCP pernet subsystem.\n"); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) int __init mptcpv6_init(void) { int err; err = mptcp_proto_v6_init(); return err; } #endif
13133 13138 13137 13135 13133 13130 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/memblock.h> #include <linux/page_ext.h> #include <linux/memory.h> #include <linux/vmalloc.h> #include <linux/kmemleak.h> #include <linux/page_owner.h> #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate.h> /* * struct page extension * * This is the feature to manage memory for extended data per page. * * Until now, we must modify struct page itself to store extra data per page. * This requires rebuilding the kernel and it is really time consuming process. * And, sometimes, rebuild is impossible due to third party module dependency. * At last, enlarging struct page could cause un-wanted system behaviour change. * * This feature is intended to overcome above mentioned problems. This feature * allocates memory for extended data per page in certain place rather than * the struct page itself. This memory can be accessed by the accessor * functions provided by this code. During the boot process, it checks whether * allocation of huge chunk of memory is needed or not. If not, it avoids * allocating memory at all. With this advantage, we can include this feature * into the kernel in default and can avoid rebuild and solve related problems. * * To help these things to work well, there are two callbacks for clients. One * is the need callback which is mandatory if user wants to avoid useless * memory allocation at boot-time. The other is optional, init callback, which * is used to do proper initialization after memory is allocated. * * The need callback is used to decide whether extended memory allocation is * needed or not. Sometimes users want to deactivate some features in this * boot and extra memory would be unnecessary. In this case, to avoid * allocating huge chunk of memory, each clients represent their need of * extra memory through the need callback. If one of the need callbacks * returns true, it means that someone needs extra memory so that * page extension core should allocates memory for page extension. If * none of need callbacks return true, memory isn't needed at all in this boot * and page extension core can skip to allocate memory. As result, * none of memory is wasted. * * When need callback returns true, page_ext checks if there is a request for * extra memory through size in struct page_ext_operations. If it is non-zero, * extra space is allocated for each page_ext entry and offset is returned to * user through offset in struct page_ext_operations. * * The init callback is used to do proper initialization after page extension * is completely initialized. In sparse memory system, extra memory is * allocated some time later than memmap is allocated. In other words, lifetime * of memory for page extension isn't same with memmap for struct page. * Therefore, clients can't store extra data until page extension is * initialized, even if pages are allocated and used freely. This could * cause inadequate state of extra data per page, so, to prevent it, client * can utilize this callback to initialize the state of it correctly. */ #ifdef CONFIG_SPARSEMEM #define PAGE_EXT_INVALID (0x1) #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) static bool need_page_idle(void) { return true; } static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, .need_shared_flags = true, }; #endif static struct page_ext_operations *page_ext_ops[] __initdata = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif #ifdef CONFIG_PAGE_TABLE_CHECK &page_table_check_ops, #endif }; unsigned long page_ext_size; static unsigned long total_usage; bool early_page_ext __meminitdata; static int __init setup_early_page_ext(char *str) { early_page_ext = true; return 0; } early_param("early_page_ext", setup_early_page_ext); static bool __init invoke_need_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); bool need = false; for (i = 0; i < entries; i++) { if (page_ext_ops[i]->need()) { if (page_ext_ops[i]->need_shared_flags) { page_ext_size = sizeof(struct page_ext); break; } } } for (i = 0; i < entries; i++) { if (page_ext_ops[i]->need()) { page_ext_ops[i]->offset = page_ext_size; page_ext_size += page_ext_ops[i]->size; need = true; } } return need; } static void __init invoke_init_callbacks(void) { int i; int entries = ARRAY_SIZE(page_ext_ops); for (i = 0; i < entries; i++) { if (page_ext_ops[i]->init) page_ext_ops[i]->init(); } } static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * index; } #ifndef CONFIG_SPARSEMEM void __init page_ext_init_flatmem_late(void) { invoke_init_callbacks(); } void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { pgdat->node_page_ext = NULL; } static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned long index; struct page_ext *base; WARN_ON_ONCE(!rcu_read_lock_held()); base = NODE_DATA(page_to_nid(page))->node_page_ext; /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ if (unlikely(!base)) return NULL; index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); return get_entry(base, index); } static int __init alloc_node_page_ext(int nid) { struct page_ext *base; unsigned long table_size; unsigned long nr_pages; nr_pages = NODE_DATA(nid)->node_spanned_pages; if (!nr_pages) return 0; /* * Need extra space if node range is not aligned with * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm * checks buddy's status, range could be out of exact node range. */ if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) || !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) nr_pages += MAX_ORDER_NR_PAGES; table_size = page_ext_size * nr_pages; base = memblock_alloc_try_nid( table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!base) return -ENOMEM; NODE_DATA(nid)->node_page_ext = base; total_usage += table_size; return 0; } void __init page_ext_init_flatmem(void) { int nid, fail; if (!invoke_need_callbacks()) return; for_each_online_node(nid) { fail = alloc_node_page_ext(nid); if (fail) goto fail; } pr_info("allocated %ld bytes of page_ext\n", total_usage); return; fail: pr_crit("allocation of page_ext failed.\n"); panic("Out of memory"); } #else /* CONFIG_SPARSEMEM */ static bool page_ext_invalid(struct page_ext *page_ext) { return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID); } static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); struct page_ext *page_ext = READ_ONCE(section->page_ext); WARN_ON_ONCE(!rcu_read_lock_held()); /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ if (page_ext_invalid(page_ext)) return NULL; return get_entry(page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) { gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; void *addr = NULL; addr = alloc_pages_exact_nid(nid, size, flags); if (addr) { kmemleak_alloc(addr, size, 1, flags); return addr; } addr = vzalloc_node(size, nid); return addr; } static int __meminit init_section_page_ext(unsigned long pfn, int nid) { struct mem_section *section; struct page_ext *base; unsigned long table_size; section = __pfn_to_section(pfn); if (section->page_ext) return 0; table_size = page_ext_size * PAGES_PER_SECTION; base = alloc_page_ext(table_size, nid); /* * The value stored in section->page_ext is (base - pfn) * and it does not point to the memory block allocated above, * causing kmemleak false positives. */ kmemleak_not_leak(base); if (!base) { pr_err("page ext allocation failure\n"); return -ENOMEM; } /* * The passed "pfn" may not be aligned to SECTION. For the calculation * we need to apply a mask. */ pfn &= PAGE_SECTION_MASK; section->page_ext = (void *)base - page_ext_size * pfn; total_usage += table_size; return 0; } static void free_page_ext(void *addr) { if (is_vmalloc_addr(addr)) { vfree(addr); } else { struct page *page = virt_to_page(addr); size_t table_size; table_size = page_ext_size * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); kmemleak_free(addr); free_pages_exact(addr, table_size); } } static void __free_page_ext(unsigned long pfn) { struct mem_section *ms; struct page_ext *base; ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; base = READ_ONCE(ms->page_ext); /* * page_ext here can be valid while doing the roll back * operation in online_page_ext(). */ if (page_ext_invalid(base)) base = (void *)base - PAGE_EXT_INVALID; WRITE_ONCE(ms->page_ext, NULL); base = get_entry(base, pfn); free_page_ext(base); } static void __invalidate_page_ext(unsigned long pfn) { struct mem_section *ms; void *val; ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; val = (void *)ms->page_ext + PAGE_EXT_INVALID; WRITE_ONCE(ms->page_ext, val); } static int __meminit online_page_ext(unsigned long start_pfn, unsigned long nr_pages, int nid) { unsigned long start, end, pfn; int fail = 0; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); if (nid == NUMA_NO_NODE) { /* * In this case, "nid" already exists and contains valid memory. * "start_pfn" passed to us is a pfn which is an arg for * online__pages(), and start_pfn should exist. */ nid = pfn_to_nid(start_pfn); VM_BUG_ON(!node_online(nid)); } for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) fail = init_section_page_ext(pfn, nid); if (!fail) return 0; /* rollback */ end = pfn - PAGES_PER_SECTION; for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); return -ENOMEM; } static void __meminit offline_page_ext(unsigned long start_pfn, unsigned long nr_pages) { unsigned long start, end, pfn; start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); /* * Freeing of page_ext is done in 3 steps to avoid * use-after-free of it: * 1) Traverse all the sections and mark their page_ext * as invalid. * 2) Wait for all the existing users of page_ext who * started before invalidation to finish. * 3) Free the page_ext. */ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __invalidate_page_ext(pfn); synchronize_rcu(); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); } static int __meminit page_ext_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; int ret = 0; switch (action) { case MEM_GOING_ONLINE: ret = online_page_ext(mn->start_pfn, mn->nr_pages, mn->status_change_nid); break; case MEM_OFFLINE: offline_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_CANCEL_ONLINE: offline_page_ext(mn->start_pfn, mn->nr_pages); break; case MEM_GOING_OFFLINE: break; case MEM_ONLINE: case MEM_CANCEL_OFFLINE: break; } return notifier_from_errno(ret); } void __init page_ext_init(void) { unsigned long pfn; int nid; if (!invoke_need_callbacks()) return; for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; start_pfn = node_start_pfn(nid); end_pfn = node_end_pfn(nid); /* * start_pfn and end_pfn may not be aligned to SECTION and the * page->flags of out of node pages are not initialized. So we * scan [start_pfn, the biggest section's pfn < end_pfn) here. */ for (pfn = start_pfn; pfn < end_pfn; pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { if (!pfn_valid(pfn)) continue; /* * Nodes's pfns can be overlapping. * We know some arch can have a nodes layout such as * -------------pfn--------------> * N0 | N1 | N2 | N0 | N1 | N2|.... */ if (pfn_to_nid(pfn) != nid) continue; if (init_section_page_ext(pfn, nid)) goto oom; cond_resched(); } } hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI); pr_info("allocated %ld bytes of page_ext\n", total_usage); invoke_init_callbacks(); return; oom: panic("Out of memory"); } void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { } #endif /** * page_ext_get() - Get the extended information for a page. * @page: The page we're interested in. * * Ensures that the page_ext will remain valid until page_ext_put() * is called. * * Return: NULL if no page_ext exists for this page. * Context: Any context. Caller may not sleep until they have called * page_ext_put(). */ struct page_ext *page_ext_get(struct page *page) { struct page_ext *page_ext; rcu_read_lock(); page_ext = lookup_page_ext(page); if (!page_ext) { rcu_read_unlock(); return NULL; } return page_ext; } /** * page_ext_put() - Working with page extended information is done. * @page_ext: Page extended information received from page_ext_get(). * * The page extended information of the page may not be valid after this * function is called. * * Return: None. * Context: Any context with corresponding page_ext_get() is called. */ void page_ext_put(struct page_ext *page_ext) { if (unlikely(!page_ext)) return; rcu_read_unlock(); }
14 14 14 43 43 43 41 35 35 35 35 21 21 16 21 20 16 14 12 11 11 11 14 14 14 14 14 14 14 31 35 35 31 31 31 35 35 31 30 33 31 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 36 35 34 35 35 35 35 32 35 32 32 35 35 35 32 32 32 31 32 35 42 3 1 41 40 4 3 2 43 42 41 41 38 38 38 38 37 36 1 36 35 36 33 38 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Oracle. All Rights Reserved. * * Author: Darrick J. Wong <darrick.wong@oracle.com> */ #include "ext4.h" #include <linux/fsmap.h> #include "fsmap.h" #include "mballoc.h" #include <linux/sort.h> #include <linux/list_sort.h> #include <trace/events/ext4.h> /* Convert an ext4_fsmap to an fsmap. */ void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, struct ext4_fsmap *src) { dest->fmr_device = src->fmr_device; dest->fmr_flags = src->fmr_flags; dest->fmr_physical = src->fmr_physical << sb->s_blocksize_bits; dest->fmr_owner = src->fmr_owner; dest->fmr_offset = 0; dest->fmr_length = src->fmr_length << sb->s_blocksize_bits; dest->fmr_reserved[0] = 0; dest->fmr_reserved[1] = 0; dest->fmr_reserved[2] = 0; } /* Convert an fsmap to an ext4_fsmap. */ void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest, struct fsmap *src) { dest->fmr_device = src->fmr_device; dest->fmr_flags = src->fmr_flags; dest->fmr_physical = src->fmr_physical >> sb->s_blocksize_bits; dest->fmr_owner = src->fmr_owner; dest->fmr_length = src->fmr_length >> sb->s_blocksize_bits; } /* getfsmap query state */ struct ext4_getfsmap_info { struct ext4_fsmap_head *gfi_head; ext4_fsmap_format_t gfi_formatter; /* formatting fn */ void *gfi_format_arg;/* format buffer */ ext4_fsblk_t gfi_next_fsblk; /* next fsblock we expect */ u32 gfi_dev; /* device id */ ext4_group_t gfi_agno; /* bg number, if applicable */ struct ext4_fsmap gfi_low; /* low rmap key */ struct ext4_fsmap gfi_high; /* high rmap key */ struct ext4_fsmap gfi_lastfree; /* free ext at end of last bg */ struct list_head gfi_meta_list; /* fixed metadata list */ bool gfi_last; /* last extent? */ }; /* Associate a device with a getfsmap handler. */ struct ext4_getfsmap_dev { int (*gfd_fn)(struct super_block *sb, struct ext4_fsmap *keys, struct ext4_getfsmap_info *info); u32 gfd_dev; }; /* Compare two getfsmap device handlers. */ static int ext4_getfsmap_dev_compare(const void *p1, const void *p2) { const struct ext4_getfsmap_dev *d1 = p1; const struct ext4_getfsmap_dev *d2 = p2; return d1->gfd_dev - d2->gfd_dev; } /* Compare a record against our starting point */ static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info, struct ext4_fsmap *rec) { return rec->fmr_physical < info->gfi_low.fmr_physical; } /* * Format a reverse mapping for getfsmap, having translated rm_startblock * into the appropriate daddr units. */ static int ext4_getfsmap_helper(struct super_block *sb, struct ext4_getfsmap_info *info, struct ext4_fsmap *rec) { struct ext4_fsmap fmr; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t rec_fsblk = rec->fmr_physical; ext4_group_t agno; ext4_grpblk_t cno; int error; if (fatal_signal_pending(current)) return -EINTR; /* * Filter out records that start before our startpoint, if the * caller requested that. */ if (ext4_getfsmap_rec_before_low_key(info, rec)) { rec_fsblk += rec->fmr_length; if (info->gfi_next_fsblk < rec_fsblk) info->gfi_next_fsblk = rec_fsblk; return EXT4_QUERY_RANGE_CONTINUE; } /* Are we just counting mappings? */ if (info->gfi_head->fmh_count == 0) { if (info->gfi_head->fmh_entries == UINT_MAX) return EXT4_QUERY_RANGE_ABORT; if (rec_fsblk > info->gfi_next_fsblk) info->gfi_head->fmh_entries++; if (info->gfi_last) return EXT4_QUERY_RANGE_CONTINUE; info->gfi_head->fmh_entries++; rec_fsblk += rec->fmr_length; if (info->gfi_next_fsblk < rec_fsblk) info->gfi_next_fsblk = rec_fsblk; return EXT4_QUERY_RANGE_CONTINUE; } /* * If the record starts past the last physical block we saw, * then we've found a gap. Report the gap as being owned by * whatever the caller specified is the missing owner. */ if (rec_fsblk > info->gfi_next_fsblk) { if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) return EXT4_QUERY_RANGE_ABORT; ext4_get_group_no_and_offset(sb, info->gfi_next_fsblk, &agno, &cno); trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno), rec_fsblk - info->gfi_next_fsblk, EXT4_FMR_OWN_UNKNOWN); fmr.fmr_device = info->gfi_dev; fmr.fmr_physical = info->gfi_next_fsblk; fmr.fmr_owner = EXT4_FMR_OWN_UNKNOWN; fmr.fmr_length = rec_fsblk - info->gfi_next_fsblk; fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; error = info->gfi_formatter(&fmr, info->gfi_format_arg); if (error) return error; info->gfi_head->fmh_entries++; } if (info->gfi_last) goto out; /* Fill out the extent we found */ if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) return EXT4_QUERY_RANGE_ABORT; ext4_get_group_no_and_offset(sb, rec_fsblk, &agno, &cno); trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno), rec->fmr_length, rec->fmr_owner); fmr.fmr_device = info->gfi_dev; fmr.fmr_physical = rec_fsblk; fmr.fmr_owner = rec->fmr_owner; fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; fmr.fmr_length = rec->fmr_length; error = info->gfi_formatter(&fmr, info->gfi_format_arg); if (error) return error; info->gfi_head->fmh_entries++; out: rec_fsblk += rec->fmr_length; if (info->gfi_next_fsblk < rec_fsblk) info->gfi_next_fsblk = rec_fsblk; return EXT4_QUERY_RANGE_CONTINUE; } static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr) { return fmr->fmr_physical + fmr->fmr_length; } /* Transform a blockgroup's free record into a fsmap */ static int ext4_getfsmap_datadev_helper(struct super_block *sb, ext4_group_t agno, ext4_grpblk_t start, ext4_grpblk_t len, void *priv) { struct ext4_fsmap irec; struct ext4_getfsmap_info *info = priv; struct ext4_fsmap *p; struct ext4_fsmap *tmp; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t fsb; ext4_fsblk_t fslen; int error; fsb = (EXT4_C2B(sbi, start) + ext4_group_first_block_no(sb, agno)); fslen = EXT4_C2B(sbi, len); /* If the retained free extent record is set... */ if (info->gfi_lastfree.fmr_owner) { /* ...and abuts this one, lengthen it and return. */ if (ext4_fsmap_next_pblk(&info->gfi_lastfree) == fsb) { info->gfi_lastfree.fmr_length += fslen; return 0; } /* * There's a gap between the two free extents; emit the * retained extent prior to merging the meta_list. */ error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); if (error) return error; info->gfi_lastfree.fmr_owner = 0; } /* Merge in any relevant extents from the meta_list */ list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) { list_del(&p->fmr_list); kfree(p); } else if (p->fmr_physical < fsb) { error = ext4_getfsmap_helper(sb, info, p); if (error) return error; list_del(&p->fmr_list); kfree(p); } } irec.fmr_device = 0; irec.fmr_physical = fsb; irec.fmr_length = fslen; irec.fmr_owner = EXT4_FMR_OWN_FREE; irec.fmr_flags = 0; /* If this is a free extent at the end of a bg, buffer it. */ if (ext4_fsmap_next_pblk(&irec) == ext4_group_first_block_no(sb, agno + 1)) { info->gfi_lastfree = irec; return 0; } /* Otherwise, emit it */ return ext4_getfsmap_helper(sb, info, &irec); } /* Execute a getfsmap query against the log device. */ static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys, struct ext4_getfsmap_info *info) { journal_t *journal = EXT4_SB(sb)->s_journal; struct ext4_fsmap irec; /* Set up search keys */ info->gfi_low = keys[0]; info->gfi_low.fmr_length = 0; memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); trace_ext4_fsmap_low_key(sb, info->gfi_dev, 0, info->gfi_low.fmr_physical, info->gfi_low.fmr_length, info->gfi_low.fmr_owner); trace_ext4_fsmap_high_key(sb, info->gfi_dev, 0, info->gfi_high.fmr_physical, info->gfi_high.fmr_length, info->gfi_high.fmr_owner); if (keys[0].fmr_physical > 0) return 0; /* Fabricate an rmap entry for the external log device. */ irec.fmr_physical = journal->j_blk_offset; irec.fmr_length = journal->j_total_len; irec.fmr_owner = EXT4_FMR_OWN_LOG; irec.fmr_flags = 0; return ext4_getfsmap_helper(sb, info, &irec); } /* Helper to fill out an ext4_fsmap. */ static inline int ext4_getfsmap_fill(struct list_head *meta_list, ext4_fsblk_t fsb, ext4_fsblk_t len, uint64_t owner) { struct ext4_fsmap *fsm; fsm = kmalloc(sizeof(*fsm), GFP_NOFS); if (!fsm) return -ENOMEM; fsm->fmr_device = 0; fsm->fmr_flags = 0; fsm->fmr_physical = fsb; fsm->fmr_owner = owner; fsm->fmr_length = len; list_add_tail(&fsm->fmr_list, meta_list); return 0; } /* * This function returns the number of file system metadata blocks at * the beginning of a block group, including the reserved gdt blocks. */ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, ext4_group_t agno, struct list_head *meta_list) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t fsb = ext4_group_first_block_no(sb, agno); ext4_fsblk_t len; unsigned long first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); unsigned long metagroup = agno / EXT4_DESC_PER_BLOCK(sb); int error; /* Record the superblock. */ if (ext4_bg_has_super(sb, agno)) { error = ext4_getfsmap_fill(meta_list, fsb, 1, EXT4_FMR_OWN_FS); if (error) return error; fsb++; } /* Record the group descriptors. */ len = ext4_bg_num_gdb(sb, agno); if (!len) return 0; error = ext4_getfsmap_fill(meta_list, fsb, len, EXT4_FMR_OWN_GDT); if (error) return error; fsb += len; /* Reserved GDT blocks */ if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) { len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); error = ext4_getfsmap_fill(meta_list, fsb, len, EXT4_FMR_OWN_RESV_GDT); if (error) return error; } return 0; } /* Compare two fsmap items. */ static int ext4_getfsmap_compare(void *priv, const struct list_head *a, const struct list_head *b) { struct ext4_fsmap *fa; struct ext4_fsmap *fb; fa = container_of(a, struct ext4_fsmap, fmr_list); fb = container_of(b, struct ext4_fsmap, fmr_list); if (fa->fmr_physical < fb->fmr_physical) return -1; else if (fa->fmr_physical > fb->fmr_physical) return 1; return 0; } /* Merge adjacent extents of fixed metadata. */ static void ext4_getfsmap_merge_fixed_metadata(struct list_head *meta_list) { struct ext4_fsmap *p; struct ext4_fsmap *prev = NULL; struct ext4_fsmap *tmp; list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { if (!prev) { prev = p; continue; } if (prev->fmr_owner == p->fmr_owner && prev->fmr_physical + prev->fmr_length == p->fmr_physical) { prev->fmr_length += p->fmr_length; list_del(&p->fmr_list); kfree(p); } else prev = p; } } /* Free a list of fixed metadata. */ static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list) { struct ext4_fsmap *p; struct ext4_fsmap *tmp; list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { list_del(&p->fmr_list); kfree(p); } } /* Find all the fixed metadata in the filesystem. */ static int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, struct list_head *meta_list) { struct ext4_group_desc *gdp; ext4_group_t agno; int error; INIT_LIST_HEAD(meta_list); /* Collect everything. */ for (agno = 0; agno < EXT4_SB(sb)->s_groups_count; agno++) { gdp = ext4_get_group_desc(sb, agno, NULL); if (!gdp) { error = -EFSCORRUPTED; goto err; } /* Superblock & GDT */ error = ext4_getfsmap_find_sb(sb, agno, meta_list); if (error) goto err; /* Block bitmap */ error = ext4_getfsmap_fill(meta_list, ext4_block_bitmap(sb, gdp), 1, EXT4_FMR_OWN_BLKBM); if (error) goto err; /* Inode bitmap */ error = ext4_getfsmap_fill(meta_list, ext4_inode_bitmap(sb, gdp), 1, EXT4_FMR_OWN_INOBM); if (error) goto err; /* Inodes */ error = ext4_getfsmap_fill(meta_list, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group, EXT4_FMR_OWN_INODES); if (error) goto err; } /* Sort the list */ list_sort(NULL, meta_list, ext4_getfsmap_compare); /* Merge adjacent extents */ ext4_getfsmap_merge_fixed_metadata(meta_list); return 0; err: ext4_getfsmap_free_fixed_metadata(meta_list); return error; } /* Execute a getfsmap query against the buddy bitmaps */ static int ext4_getfsmap_datadev(struct super_block *sb, struct ext4_fsmap *keys, struct ext4_getfsmap_info *info) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start_fsb; ext4_fsblk_t end_fsb; ext4_fsblk_t bofs; ext4_fsblk_t eofs; ext4_group_t start_ag; ext4_group_t end_ag; ext4_grpblk_t first_cluster; ext4_grpblk_t last_cluster; int error = 0; bofs = le32_to_cpu(sbi->s_es->s_first_data_block); eofs = ext4_blocks_count(sbi->s_es); if (keys[0].fmr_physical >= eofs) return 0; else if (keys[0].fmr_physical < bofs) keys[0].fmr_physical = bofs; if (keys[1].fmr_physical >= eofs) keys[1].fmr_physical = eofs - 1; if (keys[1].fmr_physical < keys[0].fmr_physical) return 0; start_fsb = keys[0].fmr_physical; end_fsb = keys[1].fmr_physical; /* Determine first and last group to examine based on start and end */ ext4_get_group_no_and_offset(sb, start_fsb, &start_ag, &first_cluster); ext4_get_group_no_and_offset(sb, end_fsb, &end_ag, &last_cluster); /* * Convert the fsmap low/high keys to bg based keys. Initialize * low to the fsmap low key and max out the high key to the end * of the bg. */ info->gfi_low = keys[0]; info->gfi_low.fmr_physical = EXT4_C2B(sbi, first_cluster); info->gfi_low.fmr_length = 0; memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); /* Assemble a list of all the fixed-location metadata. */ error = ext4_getfsmap_find_fixed_metadata(sb, &info->gfi_meta_list); if (error) goto err; /* Query each bg */ for (info->gfi_agno = start_ag; info->gfi_agno <= end_ag; info->gfi_agno++) { /* * Set the bg high key from the fsmap high key if this * is the last bg that we're querying. */ if (info->gfi_agno == end_ag) { info->gfi_high = keys[1]; info->gfi_high.fmr_physical = EXT4_C2B(sbi, last_cluster); info->gfi_high.fmr_length = 0; } trace_ext4_fsmap_low_key(sb, info->gfi_dev, info->gfi_agno, info->gfi_low.fmr_physical, info->gfi_low.fmr_length, info->gfi_low.fmr_owner); trace_ext4_fsmap_high_key(sb, info->gfi_dev, info->gfi_agno, info->gfi_high.fmr_physical, info->gfi_high.fmr_length, info->gfi_high.fmr_owner); error = ext4_mballoc_query_range(sb, info->gfi_agno, EXT4_B2C(sbi, info->gfi_low.fmr_physical), EXT4_B2C(sbi, info->gfi_high.fmr_physical), ext4_getfsmap_datadev_helper, info); if (error) goto err; /* * Set the bg low key to the start of the bg prior to * moving on to the next bg. */ if (info->gfi_agno == start_ag) memset(&info->gfi_low, 0, sizeof(info->gfi_low)); } /* Do we have a retained free extent? */ if (info->gfi_lastfree.fmr_owner) { error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); if (error) goto err; } /* Report any gaps at the end of the bg */ info->gfi_last = true; error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info); if (error) goto err; err: ext4_getfsmap_free_fixed_metadata(&info->gfi_meta_list); return error; } /* Do we recognize the device? */ static bool ext4_getfsmap_is_valid_device(struct super_block *sb, struct ext4_fsmap *fm) { if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) return true; if (EXT4_SB(sb)->s_journal_bdev_handle && fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev)) return true; return false; } /* Ensure that the low key is less than the high key. */ static bool ext4_getfsmap_check_keys(struct ext4_fsmap *low_key, struct ext4_fsmap *high_key) { if (low_key->fmr_device > high_key->fmr_device) return false; if (low_key->fmr_device < high_key->fmr_device) return true; if (low_key->fmr_physical > high_key->fmr_physical) return false; if (low_key->fmr_physical < high_key->fmr_physical) return true; if (low_key->fmr_owner > high_key->fmr_owner) return false; if (low_key->fmr_owner < high_key->fmr_owner) return true; return false; } #define EXT4_GETFSMAP_DEVS 2 /* * Get filesystem's extents as described in head, and format for * output. Calls formatter to fill the user's buffer until all * extents are mapped, until the passed-in head->fmh_count slots have * been filled, or until the formatter short-circuits the loop, if it * is tracking filled-in extents on its own. * * Key to Confusion * ---------------- * There are multiple levels of keys and counters at work here: * _fsmap_head.fmh_keys -- low and high fsmap keys passed in; * these reflect fs-wide block addrs. * dkeys -- fmh_keys used to query each device; * these are fmh_keys but w/ the low key * bumped up by fmr_length. * _getfsmap_info.gfi_next_fsblk-- next fs block we expect to see; this * is how we detect gaps in the fsmap * records and report them. * _getfsmap_info.gfi_low/high -- per-bg low/high keys computed from * dkeys; used to query the free space. */ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, ext4_fsmap_format_t formatter, void *arg) { struct ext4_fsmap dkeys[2]; /* per-dev keys */ struct ext4_getfsmap_dev handlers[EXT4_GETFSMAP_DEVS]; struct ext4_getfsmap_info info = { NULL }; int i; int error = 0; if (head->fmh_iflags & ~FMH_IF_VALID) return -EINVAL; if (!ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[0]) || !ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[1])) return -EINVAL; head->fmh_entries = 0; /* Set up our device handlers. */ memset(handlers, 0, sizeof(handlers)); handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); handlers[0].gfd_fn = ext4_getfsmap_datadev; if (EXT4_SB(sb)->s_journal_bdev_handle) { handlers[1].gfd_dev = new_encode_dev( EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev); handlers[1].gfd_fn = ext4_getfsmap_logdev; } sort(handlers, EXT4_GETFSMAP_DEVS, sizeof(struct ext4_getfsmap_dev), ext4_getfsmap_dev_compare, NULL); /* * To continue where we left off, we allow userspace to use the * last mapping from a previous call as the low key of the next. * This is identified by a non-zero length in the low key. We * have to increment the low key in this scenario to ensure we * don't return the same mapping again, and instead return the * very next mapping. * * Bump the physical offset as there can be no other mapping for * the same physical block range. */ dkeys[0] = head->fmh_keys[0]; dkeys[0].fmr_physical += dkeys[0].fmr_length; dkeys[0].fmr_owner = 0; dkeys[0].fmr_length = 0; memset(&dkeys[1], 0xFF, sizeof(struct ext4_fsmap)); if (!ext4_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) return -EINVAL; info.gfi_next_fsblk = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; info.gfi_formatter = formatter; info.gfi_format_arg = arg; info.gfi_head = head; /* For each device we support... */ for (i = 0; i < EXT4_GETFSMAP_DEVS; i++) { /* Is this device within the range the user asked for? */ if (!handlers[i].gfd_fn) continue; if (head->fmh_keys[0].fmr_device > handlers[i].gfd_dev) continue; if (head->fmh_keys[1].fmr_device < handlers[i].gfd_dev) break; /* * If this device number matches the high key, we have * to pass the high key to the handler to limit the * query results. If the device number exceeds the * low key, zero out the low key so that we get * everything from the beginning. */ if (handlers[i].gfd_dev == head->fmh_keys[1].fmr_device) dkeys[1] = head->fmh_keys[1]; if (handlers[i].gfd_dev > head->fmh_keys[0].fmr_device) memset(&dkeys[0], 0, sizeof(struct ext4_fsmap)); info.gfi_dev = handlers[i].gfd_dev; info.gfi_last = false; info.gfi_agno = -1; error = handlers[i].gfd_fn(sb, dkeys, &info); if (error) break; info.gfi_next_fsblk = 0; } head->fmh_oflags = FMH_OF_DEV_T; return error; }
182 181 182 136 161 1474 2 2 2 255 255 136 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/atomic.h> #include <linux/inetdevice.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_nat_masquerade.h> struct masq_dev_work { struct work_struct work; struct net *net; netns_tracker ns_tracker; union nf_inet_addr addr; int ifindex; int (*iter)(struct nf_conn *i, void *data); }; #define MAX_MASQ_WORKER_COUNT 16 static DEFINE_MUTEX(masq_mutex); static unsigned int masq_refcnt __read_mostly; static atomic_t masq_worker_count __read_mostly; unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, const struct nf_nat_range2 *range, const struct net_device *out) { struct nf_conn *ct; struct nf_conn_nat *nat; enum ip_conntrack_info ctinfo; struct nf_nat_range2 newrange; const struct rtable *rt; __be32 newsrc, nh; WARN_ON(hooknum != NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. */ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) return NF_ACCEPT; rt = skb_rtable(skb); nh = rt_nexthop(rt, ip_hdr(skb)->daddr); newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); if (!newsrc) { pr_info("%s ate my IP address\n", out->name); return NF_DROP; } nat = nf_ct_nat_ext_add(ct); if (nat) nat->masq_index = out->ifindex; /* Transfer from original range. */ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc; newrange.min_proto = range->min_proto; newrange.max_proto = range->max_proto; /* Hand modified range to generic setup. */ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); static void iterate_cleanup_work(struct work_struct *work) { struct nf_ct_iter_data iter_data = {}; struct masq_dev_work *w; w = container_of(work, struct masq_dev_work, work); iter_data.net = w->net; iter_data.data = (void *)w; nf_ct_iterate_cleanup_net(w->iter, &iter_data); put_net_track(w->net, &w->ns_tracker); kfree(w); atomic_dec(&masq_worker_count); module_put(THIS_MODULE); } /* Iterate conntrack table in the background and remove conntrack entries * that use the device/address being removed. * * In case too many work items have been queued already or memory allocation * fails iteration is skipped, conntrack entries will time out eventually. */ static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, int ifindex, int (*iter)(struct nf_conn *i, void *data), gfp_t gfp_flags) { struct masq_dev_work *w; if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT) return; net = maybe_get_net(net); if (!net) return; if (!try_module_get(THIS_MODULE)) goto err_module; w = kzalloc(sizeof(*w), gfp_flags); if (w) { /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */ atomic_inc(&masq_worker_count); INIT_WORK(&w->work, iterate_cleanup_work); w->ifindex = ifindex; w->net = net; netns_tracker_alloc(net, &w->ns_tracker, gfp_flags); w->iter = iter; if (addr) w->addr = *addr; schedule_work(&w->work); return; } module_put(THIS_MODULE); err_module: put_net(net); } static int device_cmp(struct nf_conn *i, void *arg) { const struct nf_conn_nat *nat = nfct_nat(i); const struct masq_dev_work *w = arg; if (!nat) return 0; return nat->masq_index == w->ifindex; } static int masq_device_event(struct notifier_block *this, unsigned long event, void *ptr) { const struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_DOWN) { /* Device was downed. Search entire table for * conntracks which were associated with that device, * and forget them. */ nf_nat_masq_schedule(net, NULL, dev->ifindex, device_cmp, GFP_KERNEL); } return NOTIFY_DONE; } static int inet_cmp(struct nf_conn *ct, void *ptr) { struct nf_conntrack_tuple *tuple; struct masq_dev_work *w = ptr; if (!device_cmp(ct, ptr)) return 0; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); } static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { const struct in_ifaddr *ifa = ptr; const struct in_device *idev; const struct net_device *dev; union nf_inet_addr addr; if (event != NETDEV_DOWN) return NOTIFY_DONE; /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have * no work to do. Otherwise this is an individual address removal * and we have to perform the flush. */ idev = ifa->ifa_dev; if (idev->dead) return NOTIFY_DONE; memset(&addr, 0, sizeof(addr)); addr.ip = ifa->ifa_address; dev = idev->dev; nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex, inet_cmp, GFP_KERNEL); return NOTIFY_DONE; } static struct notifier_block masq_dev_notifier = { .notifier_call = masq_device_event, }; static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; #if IS_ENABLED(CONFIG_IPV6) static int nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr) { #ifdef CONFIG_IPV6_MODULE const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); if (!v6_ops) return -EHOSTUNREACH; return v6_ops->dev_get_saddr(net, dev, daddr, srcprefs, saddr); #else return ipv6_dev_get_saddr(net, dev, daddr, srcprefs, saddr); #endif } unsigned int nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out) { enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; struct in6_addr src; struct nf_conn *ct; struct nf_nat_range2 newrange; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); if (nat_ipv6_dev_get_saddr(nf_ct_net(ct), out, &ipv6_hdr(skb)->daddr, 0, &src) < 0) return NF_DROP; nat = nf_ct_nat_ext_add(ct); if (nat) nat->masq_index = out->ifindex; newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.in6 = src; newrange.max_addr.in6 = src; newrange.min_proto = range->min_proto; newrange.max_proto = range->max_proto; return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep). * * Defer it to the system workqueue. * * As we can have 'a lot' of inet_events (depending on amount of ipv6 * addresses being deleted), we also need to limit work item queue. */ static int masq_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; const struct net_device *dev; union nf_inet_addr addr; if (event != NETDEV_DOWN) return NOTIFY_DONE; dev = ifa->idev->dev; memset(&addr, 0, sizeof(addr)); addr.in6 = ifa->addr; nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp, GFP_ATOMIC); return NOTIFY_DONE; } static struct notifier_block masq_inet6_notifier = { .notifier_call = masq_inet6_event, }; static int nf_nat_masquerade_ipv6_register_notifier(void) { return register_inet6addr_notifier(&masq_inet6_notifier); } #else static inline int nf_nat_masquerade_ipv6_register_notifier(void) { return 0; } #endif int nf_nat_masquerade_inet_register_notifiers(void) { int ret = 0; mutex_lock(&masq_mutex); if (WARN_ON_ONCE(masq_refcnt == UINT_MAX)) { ret = -EOVERFLOW; goto out_unlock; } /* check if the notifier was already set */ if (++masq_refcnt > 1) goto out_unlock; /* Register for device down reports */ ret = register_netdevice_notifier(&masq_dev_notifier); if (ret) goto err_dec; /* Register IP address change reports */ ret = register_inetaddr_notifier(&masq_inet_notifier); if (ret) goto err_unregister; ret = nf_nat_masquerade_ipv6_register_notifier(); if (ret) goto err_unreg_inet; mutex_unlock(&masq_mutex); return ret; err_unreg_inet: unregister_inetaddr_notifier(&masq_inet_notifier); err_unregister: unregister_netdevice_notifier(&masq_dev_notifier); err_dec: masq_refcnt--; out_unlock: mutex_unlock(&masq_mutex); return ret; } EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_register_notifiers); void nf_nat_masquerade_inet_unregister_notifiers(void) { mutex_lock(&masq_mutex); /* check if the notifiers still have clients */ if (--masq_refcnt > 0) goto out_unlock; unregister_netdevice_notifier(&masq_dev_notifier); unregister_inetaddr_notifier(&masq_inet_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&masq_inet6_notifier); #endif out_unlock: mutex_unlock(&masq_mutex); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_unregister_notifiers);
30 30 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/seq_file.h> #include <net/ip.h> #include <net/mptcp.h> #include <net/snmp.h> #include <net/net_namespace.h> #include "mib.h" static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE), SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE), SNMP_MIB_ITEM("MPCapableSYNACKRX", MPTCP_MIB_MPCAPABLEACTIVEACK), SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK), SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK), SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT), SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX), SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL), SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE), SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE), SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW), SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA), SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR), SNMP_MIB_ITEM("AddAddrTx", MPTCP_MIB_ADDADDRTX), SNMP_MIB_ITEM("AddAddrTxDrop", MPTCP_MIB_ADDADDRTXDROP), SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD), SNMP_MIB_ITEM("EchoAddTx", MPTCP_MIB_ECHOADDTX), SNMP_MIB_ITEM("EchoAddTxDrop", MPTCP_MIB_ECHOADDTXDROP), SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD), SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP), SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX), SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX), SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX), SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX), SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX), SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR), SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP), SNMP_MIB_ITEM("RmAddrTx", MPTCP_MIB_RMADDRTX), SNMP_MIB_ITEM("RmAddrTxDrop", MPTCP_MIB_RMADDRTXDROP), SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), SNMP_MIB_ITEM("MPFailTx", MPTCP_MIB_MPFAILTX), SNMP_MIB_ITEM("MPFailRx", MPTCP_MIB_MPFAILRX), SNMP_MIB_ITEM("MPFastcloseTx", MPTCP_MIB_MPFASTCLOSETX), SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX), SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX), SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX), SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED), SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED), SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE), SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT), SNMP_MIB_SENTINEL }; /* mptcp_mib_alloc - allocate percpu mib counters * * These are allocated when the first mptcp socket is created so * we do not waste percpu memory if mptcp isn't in use. */ bool mptcp_mib_alloc(struct net *net) { struct mptcp_mib __percpu *mib = alloc_percpu(struct mptcp_mib); if (!mib) return false; if (cmpxchg(&net->mib.mptcp_statistics, NULL, mib)) free_percpu(mib); return true; } void mptcp_seq_show(struct seq_file *seq) { unsigned long sum[ARRAY_SIZE(mptcp_snmp_list) - 1]; struct net *net = seq->private; int i; seq_puts(seq, "MPTcpExt:"); for (i = 0; mptcp_snmp_list[i].name; i++) seq_printf(seq, " %s", mptcp_snmp_list[i].name); seq_puts(seq, "\nMPTcpExt:"); memset(sum, 0, sizeof(sum)); if (net->mib.mptcp_statistics) snmp_get_cpu_field_batch(sum, mptcp_snmp_list, net->mib.mptcp_statistics); for (i = 0; mptcp_snmp_list[i].name; i++) seq_printf(seq, " %lu", sum[i]); seq_putc(seq, '\n'); }
485 9 4094 1472 1438 386 6 436 557 1541 5763 1346 3 2773 427 33 213 332 268 250 333 337 265 258 8 34 6390 5660 6068 23 7 913 234 381 15 312 76 892 2 507 4 48 6 6 24 24 1421 805 227 1337 223 61 31 12 44 29 50 1095 1410 1239 14 4521 3412 39 2 77 255 613 5369 107 1593 7 5 1378 3429 3869 2095 66 189 8 195 1406 3023 1755 39 18 293 276 4 6 221 802 2 1112 177 544 1653 286 269 67 350 24 571 326 40 275 147 38 1 4 58 42 97 975 168 153 169 73 2 4 2227 942 178 2410 2 49 396 381 40 12 126 18 172 216 221 6 342 214 40 3 1 2 159 2 2 17 167 1 2 838 24 3 4 351 1604 308 328 141 141 188 80 757 897 5 73 11 362 10 1 1 1 1 1 1 12 3 1 3 2 5 6150 1221 15 5894 3020 13 13 31 3258 19 30 309 508 2914 187 188 100 3193 199 13 358 11 1199 5 4 23 272 544 788 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct sk_buff' memory handlers. * * Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> * Florian La Roche, <rzsfl@rz.uni-sb.de> */ #ifndef _LINUX_SKBUFF_H #define _LINUX_SKBUFF_H #include <linux/kernel.h> #include <linux/compiler.h> #include <linux/time.h> #include <linux/bug.h> #include <linux/bvec.h> #include <linux/cache.h> #include <linux/rbtree.h> #include <linux/socket.h> #include <linux/refcount.h> #include <linux/atomic.h> #include <asm/types.h> #include <linux/spinlock.h> #include <net/checksum.h> #include <linux/rcupdate.h> #include <linux/dma-mapping.h> #include <linux/netdev_features.h> #include <net/flow_dissector.h> #include <linux/in6.h> #include <linux/if_packet.h> #include <linux/llist.h> #include <net/flow.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <linux/netfilter/nf_conntrack_common.h> #endif #include <net/net_debug.h> #include <net/dropreason-core.h> /** * DOC: skb checksums * * The interface for checksum offload between the stack and networking drivers * is as follows... * * IP checksum related features * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Drivers advertise checksum offload capabilities in the features of a device. * From the stack's point of view these are capabilities offered by the driver. * A driver typically only advertises features that it is capable of offloading * to its device. * * .. flat-table:: Checksum related device features * :widths: 1 10 * * * - %NETIF_F_HW_CSUM * - The driver (or its device) is able to compute one * IP (one's complement) checksum for any combination * of protocols or protocol layering. The checksum is * computed and set in a packet per the CHECKSUM_PARTIAL * interface (see below). * * * - %NETIF_F_IP_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv4. These are specifically * unencapsulated packets of the form IPv4|TCP or * IPv4|UDP where the Protocol field in the IPv4 header * is TCP or UDP. The IPv4 header may contain IP options. * This feature cannot be set in features for a device * with NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). * * * - %NETIF_F_IPV6_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with * NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). * * * - %NETIF_F_RXCSUM * - Driver (device) performs receive checksum offload. * This flag is only used to disable the RX checksum * feature for a device. The stack will accept receive * checksum indication in packets received on a device * regardless of whether NETIF_F_RXCSUM is set. * * Checksumming of received packets by device * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Indication of checksum verification is set in &sk_buff.ip_summed. * Possible values are: * * - %CHECKSUM_NONE * * Device did not checksum this packet e.g. due to lack of capabilities. * The packet contains full (though not verified) checksum in packet but * not in skb->csum. Thus, skb->csum is undefined in this case. * * - %CHECKSUM_UNNECESSARY * * The hardware you're dealing with doesn't calculate the full checksum * (as in %CHECKSUM_COMPLETE), but it does parse headers and verify checksums * for specific protocols. For such packets it will set %CHECKSUM_UNNECESSARY * if their checksums are okay. &sk_buff.csum is still undefined in this case * though. A driver or device must never modify the checksum field in the * packet even if checksum is verified. * * %CHECKSUM_UNNECESSARY is applicable to following protocols: * * - TCP: IPv6 and IPv4. * - UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a * zero UDP checksum for either IPv4 or IPv6, the networking stack * may perform further validation in this case. * - GRE: only if the checksum is present in the header. * - SCTP: indicates the CRC in SCTP header has been validated. * - FCOE: indicates the CRC in FC frame has been validated. * * &sk_buff.csum_level indicates the number of consecutive checksums found in * the packet minus one that have been verified as %CHECKSUM_UNNECESSARY. * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet * and a device is able to verify the checksums for UDP (possibly zero), * GRE (checksum flag is set) and TCP, &sk_buff.csum_level would be set to * two. If the device were only able to verify the UDP checksum and not * GRE, either because it doesn't support GRE checksum or because GRE * checksum is bad, skb->csum_level would be set to zero (TCP checksum is * not considered in this case). * * - %CHECKSUM_COMPLETE * * This is the most generic way. The device supplied checksum of the _whole_ * packet as seen by netif_rx() and fills in &sk_buff.csum. This means the * hardware doesn't need to parse L3/L4 headers to implement this. * * Notes: * * - Even if device supports only some protocols, but is able to produce * skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY. * - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols. * * - %CHECKSUM_PARTIAL * * A checksum is set up to be offloaded to a device as described in the * output description for CHECKSUM_PARTIAL. This may occur on a packet * received directly from another Linux OS, e.g., a virtualized Linux kernel * on the same host, or it may be set in the input path in GRO or remote * checksum offload. For the purposes of checksum verification, the checksum * referred to by skb->csum_start + skb->csum_offset and any preceding * checksums in the packet are considered verified. Any checksums in the * packet that are after the checksum being offloaded are not considered to * be verified. * * Checksumming on transmit for non-GSO * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * The stack requests checksum offload in the &sk_buff.ip_summed for a packet. * Values are: * * - %CHECKSUM_PARTIAL * * The driver is required to checksum the packet as seen by hard_start_xmit() * from &sk_buff.csum_start up to the end, and to record/write the checksum at * offset &sk_buff.csum_start + &sk_buff.csum_offset. * A driver may verify that the * csum_start and csum_offset values are valid values given the length and * offset of the packet, but it should not attempt to validate that the * checksum refers to a legitimate transport layer checksum -- it is the * purview of the stack to validate that csum_start and csum_offset are set * correctly. * * When the stack requests checksum offload for a packet, the driver MUST * ensure that the checksum is set correctly. A driver can either offload the * checksum calculation to the device, or call skb_checksum_help (in the case * that the device does not support offload for a particular checksum). * * %NETIF_F_IP_CSUM and %NETIF_F_IPV6_CSUM are being deprecated in favor of * %NETIF_F_HW_CSUM. New devices should use %NETIF_F_HW_CSUM to indicate * checksum offload capability. * skb_csum_hwoffload_help() can be called to resolve %CHECKSUM_PARTIAL based * on network device checksumming capabilities: if a packet does not match * them, skb_checksum_help() or skb_crc32c_help() (depending on the value of * &sk_buff.csum_not_inet, see :ref:`crc`) * is called to resolve the checksum. * * - %CHECKSUM_NONE * * The skb was already checksummed by the protocol, or a checksum is not * required. * * - %CHECKSUM_UNNECESSARY * * This has the same meaning as CHECKSUM_NONE for checksum offload on * output. * * - %CHECKSUM_COMPLETE * * Not used in checksum output. If a driver observes a packet with this value * set in skbuff, it should treat the packet as if %CHECKSUM_NONE were set. * * .. _crc: * * Non-IP checksum (CRC) offloads * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * .. flat-table:: * :widths: 1 10 * * * - %NETIF_F_SCTP_CRC * - This feature indicates that a device is capable of * offloading the SCTP CRC in a packet. To perform this offload the stack * will set csum_start and csum_offset accordingly, set ip_summed to * %CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication * in the skbuff that the %CHECKSUM_PARTIAL refers to CRC32c. * A driver that supports both IP checksum offload and SCTP CRC32c offload * must verify which offload is configured for a packet by testing the * value of &sk_buff.csum_not_inet; skb_crc32c_csum_help() is provided to * resolve %CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1. * * * - %NETIF_F_FCOE_CRC * - This feature indicates that a device is capable of offloading the FCOE * CRC in a packet. To perform this offload the stack will set ip_summed * to %CHECKSUM_PARTIAL and set csum_start and csum_offset * accordingly. Note that there is no indication in the skbuff that the * %CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports * both IP checksum offload and FCOE CRC offload must verify which offload * is configured for a packet, presumably by inspecting packet headers. * * Checksumming on output with GSO * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * In the case of a GSO packet (skb_is_gso() is true), checksum offload * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the * gso_type is %SKB_GSO_TCPV4 or %SKB_GSO_TCPV6, TCP checksum offload as * part of the GSO operation is implied. If a checksum is being offloaded * with GSO then ip_summed is %CHECKSUM_PARTIAL, and both csum_start and * csum_offset are set to refer to the outermost checksum being offloaded * (two offloaded checksums are possible with UDP encapsulation). */ /* Don't change this without changing skb_csum_unnecessary! */ #define CHECKSUM_NONE 0 #define CHECKSUM_UNNECESSARY 1 #define CHECKSUM_COMPLETE 2 #define CHECKSUM_PARTIAL 3 /* Maximum value in skb->csum_level */ #define SKB_MAX_CSUM_LEVEL 3 #define SKB_DATA_ALIGN(X) ALIGN(X, SMP_CACHE_BYTES) #define SKB_WITH_OVERHEAD(X) \ ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) /* For X bytes available in skb->head, what is the minimal * allocation needed, knowing struct skb_shared_info needs * to be aligned. */ #define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) #define SKB_MAX_ORDER(X, ORDER) \ SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X)) #define SKB_MAX_HEAD(X) (SKB_MAX_ORDER((X), 0)) #define SKB_MAX_ALLOC (SKB_MAX_ORDER(0, 2)) /* return minimum truesize of one skb containing X bytes of data */ #define SKB_TRUESIZE(X) ((X) + \ SKB_DATA_ALIGN(sizeof(struct sk_buff)) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) struct ahash_request; struct net_device; struct scatterlist; struct pipe_inode_info; struct iov_iter; struct napi_struct; struct bpf_prog; union bpf_attr; struct skb_ext; struct ts_config; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { enum { BRNF_PROTO_UNCHANGED, BRNF_PROTO_8021Q, BRNF_PROTO_PPPOE } orig_proto:8; u8 pkt_otherhost:1; u8 in_prerouting:1; u8 bridged_dnat:1; u8 sabotage_in_done:1; __u16 frag_max_size; struct net_device *physindev; /* always valid & non-NULL from FORWARD on, for physdev match */ struct net_device *physoutdev; union { /* prerouting: detect dnat in orig/reply direction */ __be32 ipv4_daddr; struct in6_addr ipv6_daddr; /* after prerouting + nat detected: store original source * mac since neigh resolution overwrites it, only used while * skb is out in neigh layer. */ char neigh_header[8]; }; }; #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) /* Chain in tc_skb_ext will be used to share the tc chain with * ovs recirc_id. It will be set to the current chain by tc * and read by ovs to recirc_id. */ struct tc_skb_ext { union { u64 act_miss_cookie; __u32 chain; }; __u16 mru; __u16 zone; u8 post_ct:1; u8 post_ct_snat:1; u8 post_ct_dnat:1; u8 act_miss:1; /* Set if act_miss_cookie is used */ u8 l2_miss:1; /* Set by bridge upon FDB or MDB miss */ }; #endif struct sk_buff_head { /* These two members must be first to match sk_buff. */ struct_group_tagged(sk_buff_list, list, struct sk_buff *next; struct sk_buff *prev; ); __u32 qlen; spinlock_t lock; }; struct sk_buff; #ifndef CONFIG_MAX_SKB_FRAGS # define CONFIG_MAX_SKB_FRAGS 17 #endif #define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS extern int sysctl_max_skb_frags; /* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to * segment using its current segmentation instead. */ #define GSO_BY_FRAGS 0xFFFF typedef struct bio_vec skb_frag_t; /** * skb_frag_size() - Returns the size of a skb fragment * @frag: skb fragment */ static inline unsigned int skb_frag_size(const skb_frag_t *frag) { return frag->bv_len; } /** * skb_frag_size_set() - Sets the size of a skb fragment * @frag: skb fragment * @size: size of fragment */ static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size) { frag->bv_len = size; } /** * skb_frag_size_add() - Increments the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to add */ static inline void skb_frag_size_add(skb_frag_t *frag, int delta) { frag->bv_len += delta; } /** * skb_frag_size_sub() - Decrements the size of a skb fragment by @delta * @frag: skb fragment * @delta: value to subtract */ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta) { frag->bv_len -= delta; } /** * skb_frag_must_loop - Test if %p is a high memory page * @p: fragment's page */ static inline bool skb_frag_must_loop(struct page *p) { #if defined(CONFIG_HIGHMEM) if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p)) return true; #endif return false; } /** * skb_frag_foreach_page - loop over pages in a fragment * * @f: skb frag to operate on * @f_off: offset from start of f->bv_page * @f_len: length from f_off to loop over * @p: (temp var) current page * @p_off: (temp var) offset from start of current page, * non-zero only on first page. * @p_len: (temp var) length in current page, * < PAGE_SIZE only on first and last page. * @copied: (temp var) length so far, excluding current p_len. * * A fragment can hold a compound page, in which case per-page * operations, notably kmap_atomic, must be called for each * regular page. */ #define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied) \ for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT), \ p_off = (f_off) & (PAGE_SIZE - 1), \ p_len = skb_frag_must_loop(p) ? \ min_t(u32, f_len, PAGE_SIZE - p_off) : f_len, \ copied = 0; \ copied < f_len; \ copied += p_len, p++, p_off = 0, \ p_len = min_t(u32, f_len - copied, PAGE_SIZE)) \ /** * struct skb_shared_hwtstamps - hardware time stamps * @hwtstamp: hardware time stamp transformed into duration * since arbitrary point in time * @netdev_data: address/cookie of network device driver used as * reference to actual hardware time stamp * * Software time stamps generated by ktime_get_real() are stored in * skb->tstamp. * * hwtstamps can only be compared against other hwtstamps from * the same device. * * This structure is attached to packets as part of the * &skb_shared_info. Use skb_hwtstamps() to get a pointer. */ struct skb_shared_hwtstamps { union { ktime_t hwtstamp; void *netdev_data; }; }; /* Definitions for tx_flags in struct skb_shared_info */ enum { /* generate hardware time stamp */ SKBTX_HW_TSTAMP = 1 << 0, /* generate software time stamp when queueing packet to NIC */ SKBTX_SW_TSTAMP = 1 << 1, /* device driver is going to provide hardware time stamp */ SKBTX_IN_PROGRESS = 1 << 2, /* generate hardware time stamp based on cycles if supported */ SKBTX_HW_TSTAMP_USE_CYCLES = 1 << 3, /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, /* determine hardware time stamp based on time or cycles */ SKBTX_HW_TSTAMP_NETDEV = 1 << 5, /* generate software time stamp when entering packet scheduling */ SKBTX_SCHED_TSTAMP = 1 << 6, }; #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \ SKBTX_HW_TSTAMP_USE_CYCLES | \ SKBTX_ANY_SW_TSTAMP) /* Definitions for flags in struct skb_shared_info */ enum { /* use zcopy routines */ SKBFL_ZEROCOPY_ENABLE = BIT(0), /* This indicates at least one fragment might be overwritten * (as in vmsplice(), sendfile() ...) * If we need to compute a TX checksum, we'll need to copy * all frags to avoid possible bad checksum */ SKBFL_SHARED_FRAG = BIT(1), /* segment contains only zerocopy data and should not be * charged to the kernel memory. */ SKBFL_PURE_ZEROCOPY = BIT(2), SKBFL_DONT_ORPHAN = BIT(3), /* page references are managed by the ubuf_info, so it's safe to * use frags only up until ubuf_info is released */ SKBFL_MANAGED_FRAG_REFS = BIT(4), }; #define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG) #define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \ SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS) /* * The callback notifies userspace to release buffers when skb DMA is done in * lower device, the skb last reference should be 0 when calling this. * The zerocopy_success argument is true if zero copy transmit occurred, * false on data copy or out of memory error caused by data copy attempt. * The ctx field is used to track device context. * The desc field is used to track userspace buffer index. */ struct ubuf_info { void (*callback)(struct sk_buff *, struct ubuf_info *, bool zerocopy_success); refcount_t refcnt; u8 flags; }; struct ubuf_info_msgzc { struct ubuf_info ubuf; union { struct { unsigned long desc; void *ctx; }; struct { u32 id; u16 len; u16 zerocopy:1; u32 bytelen; }; }; struct mmpin { struct user_struct *user; unsigned int num_pg; } mmp; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) #define uarg_to_msgzc(ubuf_ptr) container_of((ubuf_ptr), struct ubuf_info_msgzc, \ ubuf) int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); /* Preserve some data across TX submission and completion. * * Note, this state is stored in the driver. Extending the layout * might need some special care. */ struct xsk_tx_metadata_compl { __u64 *tx_timestamp; }; /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ struct skb_shared_info { __u8 flags; __u8 meta_len; __u8 nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; struct sk_buff *frag_list; union { struct skb_shared_hwtstamps hwtstamps; struct xsk_tx_metadata_compl xsk_meta; }; unsigned int gso_type; u32 tskey; /* * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; unsigned int xdp_frags_size; /* Intermediate layers must ensure that destructor_arg * remains valid until skb destructor */ void * destructor_arg; /* must be last field, see pskb_expand_head() */ skb_frag_t frags[MAX_SKB_FRAGS]; }; /** * DOC: dataref and headerless skbs * * Transport layers send out clones of payload skbs they hold for * retransmissions. To allow lower layers of the stack to prepend their headers * we split &skb_shared_info.dataref into two halves. * The lower 16 bits count the overall number of references. * The higher 16 bits indicate how many of the references are payload-only. * skb_header_cloned() checks if skb is allowed to add / write the headers. * * The creator of the skb (e.g. TCP) marks its skb as &sk_buff.nohdr * (via __skb_header_release()). Any clone created from marked skb will get * &sk_buff.hdr_len populated with the available headroom. * If there's the only clone in existence it's able to modify the headroom * at will. The sequence of calls inside the transport layer is:: * * <alloc skb> * skb_reserve() * __skb_header_release() * skb_clone() * // send the clone down the stack * * This is not a very generic construct and it depends on the transport layers * doing the right thing. In practice there's usually only one payload-only skb. * Having multiple payload-only skbs with different lengths of hdr_len is not * possible. The payload-only skbs should never leave their owner. */ #define SKB_DATAREF_SHIFT 16 #define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1) enum { SKB_FCLONE_UNAVAILABLE, /* skb has no fclone (from head_cache) */ SKB_FCLONE_ORIG, /* orig skb (from fclone_cache) */ SKB_FCLONE_CLONE, /* companion fclone skb (from fclone_cache) */ }; enum { SKB_GSO_TCPV4 = 1 << 0, /* This indicates the skb is from an untrusted source. */ SKB_GSO_DODGY = 1 << 1, /* This indicates the tcp segment has CWR set. */ SKB_GSO_TCP_ECN = 1 << 2, SKB_GSO_TCP_FIXEDID = 1 << 3, SKB_GSO_TCPV6 = 1 << 4, SKB_GSO_FCOE = 1 << 5, SKB_GSO_GRE = 1 << 6, SKB_GSO_GRE_CSUM = 1 << 7, SKB_GSO_IPXIP4 = 1 << 8, SKB_GSO_IPXIP6 = 1 << 9, SKB_GSO_UDP_TUNNEL = 1 << 10, SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, SKB_GSO_PARTIAL = 1 << 12, SKB_GSO_TUNNEL_REMCSUM = 1 << 13, SKB_GSO_SCTP = 1 << 14, SKB_GSO_ESP = 1 << 15, SKB_GSO_UDP = 1 << 16, SKB_GSO_UDP_L4 = 1 << 17, SKB_GSO_FRAGLIST = 1 << 18, }; #if BITS_PER_LONG > 32 #define NET_SKBUFF_DATA_USES_OFFSET 1 #endif #ifdef NET_SKBUFF_DATA_USES_OFFSET typedef unsigned int sk_buff_data_t; #else typedef unsigned char *sk_buff_data_t; #endif /** * DOC: Basic sk_buff geometry * * struct sk_buff itself is a metadata structure and does not hold any packet * data. All the data is held in associated buffers. * * &sk_buff.head points to the main "head" buffer. The head buffer is divided * into two parts: * * - data buffer, containing headers and sometimes payload; * this is the part of the skb operated on by the common helpers * such as skb_put() or skb_pull(); * - shared info (struct skb_shared_info) which holds an array of pointers * to read-only data in the (page, offset, length) format. * * Optionally &skb_shared_info.frag_list may point to another skb. * * Basic diagram may look like this:: * * --------------- * | sk_buff | * --------------- * ,--------------------------- + head * / ,----------------- + data * / / ,----------- + tail * | | | , + end * | | | | * v v v v * ----------------------------------------------- * | headroom | data | tailroom | skb_shared_info | * ----------------------------------------------- * + [page frag] * + [page frag] * + [page frag] * + [page frag] --------- * + frag_list --> | sk_buff | * --------- * */ /** * struct sk_buff - socket buffer * @next: Next buffer in list * @prev: Previous buffer in list * @tstamp: Time we arrived/left * @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point * for retransmit timer * @rbnode: RB tree node, alternative to next/prev for netem/tcp * @list: queue head * @ll_node: anchor in an llist (eg socket defer_list) * @sk: Socket we are owned by * @ip_defrag_offset: (aka @sk) alternate use of @sk, used in * fragmentation management * @dev: Device we arrived on/are leaving by * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. Put private vars here * @_skb_refdst: destination entry (with norefcount bit) * @sp: the security path, used for xfrm * @len: Length of actual data * @data_len: Data length * @mac_len: Length of link layer header * @hdr_len: writable header length of cloned skb * @csum: Checksum (must include start/offset pair) * @csum_start: Offset from skb->head where checksumming should start * @csum_offset: Offset from csum_start where checksum should be stored * @priority: Packet queueing priority * @ignore_df: allow local fragmentation * @cloned: Head may be cloned (check refcnt to be sure) * @ip_summed: Driver fed us an IP checksum * @nohdr: Payload reference only, must not modify header * @pkt_type: Packet class * @fclone: skbuff clone status * @ipvs_property: skbuff is owned by ipvs * @inner_protocol_type: whether the inner protocol is * ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO * @remcsum_offload: remote checksum offload is enabled * @offload_fwd_mark: Packet was L2-forwarded in hardware * @offload_l3_fwd_mark: Packet was L3-forwarded in hardware * @tc_skip_classify: do not classify packet. set by IFB device * @tc_at_ingress: used within tc_classify to distinguish in/egress * @redirected: packet was redirected by packet classifier * @from_ingress: packet was redirected from the ingress path * @nf_skip_egress: packet shall skip nf egress - see netfilter_netdev.h * @peeked: this packet has been seen already, so stats have been * done for it, don't do them again * @nf_trace: netfilter packet trace flag * @protocol: Packet protocol from driver * @destructor: Destruct function * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on * @tc_index: Traffic control index * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @head_frag: skb was allocated from page fragments, * not allocated by kmalloc() or vmalloc(). * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves * @pp_recycle: mark the packet for recycling instead of freeing (implies * page_pool support on driver) * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport * ports. * @sw_hash: indicates hash was computed in software stack * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @encapsulation: indicates the inner headers in the skbuff are valid * @encap_hdr_csum: software checksum is needed * @csum_valid: checksum is already valid * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL * @csum_complete_sw: checksum was completed by software * @csum_level: indicates the number of consecutive checksums found in * the packet minus one that have been verified as * CHECKSUM_UNNECESSARY (max 3) * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required * @mono_delivery_time: When set, skb->tstamp has the * delivery_time in mono clock base (i.e. EDT). Otherwise, the * skb->tstamp has the (rcv) timestamp at ingress and * delivery_time at egress. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @alloc_cpu: CPU which did the skb allocation. * @secmark: security marking * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available * at the tail of an sk_buff * @vlan_all: vlan fields (proto & tci) * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_protocol: Protocol (encapsulation) * @inner_ipproto: (aka @inner_protocol) stores ipproto when * skb->inner_protocol_type == ENCAP_TYPE_IPPROTO; * @inner_transport_header: Inner transport layer header (encapsulation) * @inner_network_header: Network layer header (encapsulation) * @inner_mac_header: Link layer header (encapsulation) * @transport_header: Transport layer header * @network_header: Network layer header * @mac_header: Link layer header * @kcov_handle: KCOV remote handle for remote coverage collection * @tail: Tail pointer * @end: End pointer * @head: Head of buffer * @data: Data head pointer * @truesize: Buffer size * @users: User count - see {datagram,tcp}.c * @extensions: allocated extensions, valid if active_extensions is nonzero */ struct sk_buff { union { struct { /* These two members must be first to match sk_buff_head. */ struct sk_buff *next; struct sk_buff *prev; union { struct net_device *dev; /* Some protocols might use this space to store information, * while device pointer would be NULL. * UDP receive path is one user. */ unsigned long dev_scratch; }; }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; struct llist_node ll_node; }; union { struct sock *sk; int ip_defrag_offset; }; union { ktime_t tstamp; u64 skb_mstamp_ns; /* earliest departure time */ }; /* * This is the control buffer. It is free to use for every * layer. Please put your private variables there. If you * want to keep them across layers you have to do a skb_clone() * first. This is owned by whoever has the skb queued ATM. */ char cb[48] __aligned(8); union { struct { unsigned long _skb_refdst; void (*destructor)(struct sk_buff *skb); }; struct list_head tcp_tsorted_anchor; #ifdef CONFIG_NET_SOCK_MSG unsigned long _sk_redir; #endif }; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) unsigned long _nfct; #endif unsigned int len, data_len; __u16 mac_len, hdr_len; /* Following fields are _not_ copied in __copy_skb_header() * Note that queue_mapping is here mostly to fill a hole. */ __u16 queue_mapping; /* if you move cloned around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD #define CLONED_MASK (1 << 7) #else #define CLONED_MASK 1 #endif #define CLONED_OFFSET offsetof(struct sk_buff, __cloned_offset) /* private: */ __u8 __cloned_offset[0]; /* public: */ __u8 cloned:1, nohdr:1, fclone:2, peeked:1, head_frag:1, pfmemalloc:1, pp_recycle:1; /* page_pool recycle indicator */ #ifdef CONFIG_SKB_EXTENSIONS __u8 active_extensions; #endif /* Fields enclosed in headers group are copied * using a single memcpy() in __copy_skb_header() */ struct_group(headers, /* private: */ __u8 __pkt_type_offset[0]; /* public: */ __u8 pkt_type:3; /* see PKT_TYPE_MAX */ __u8 ignore_df:1; __u8 dst_pending_confirm:1; __u8 ip_summed:2; __u8 ooo_okay:1; /* private: */ __u8 __mono_tc_offset[0]; /* public: */ __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ #ifdef CONFIG_NET_XGRESS __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ __u8 tc_skip_classify:1; #endif __u8 remcsum_offload:1; __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 inner_protocol_type:1; __u8 l4_hash:1; __u8 sw_hash:1; #ifdef CONFIG_WIRELESS __u8 wifi_acked_valid:1; __u8 wifi_acked:1; #endif __u8 no_fcs:1; /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif #if IS_ENABLED(CONFIG_IP_VS) __u8 ipvs_property:1; #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) __u8 nf_trace:1; #endif #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; #endif __u8 redirected:1; #ifdef CONFIG_NET_REDIRECT __u8 from_ingress:1; #endif #ifdef CONFIG_NETFILTER_SKIP_EGRESS __u8 nf_skip_egress:1; #endif #ifdef CONFIG_TLS_DEVICE __u8 decrypted:1; #endif __u8 slow_gro:1; #if IS_ENABLED(CONFIG_IP_SCTP) __u8 csum_not_inet:1; #endif #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) __u16 tc_index; /* traffic control index */ #endif u16 alloc_cpu; union { __wsum csum; struct { __u16 csum_start; __u16 csum_offset; }; }; __u32 priority; int skb_iif; __u32 hash; union { u32 vlan_all; struct { __be16 vlan_proto; __u16 vlan_tci; }; }; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) union { unsigned int napi_id; unsigned int sender_cpu; }; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; #endif union { __u32 mark; __u32 reserved_tailroom; }; union { __be16 inner_protocol; __u8 inner_ipproto; }; __u16 inner_transport_header; __u16 inner_network_header; __u16 inner_mac_header; __be16 protocol; __u16 transport_header; __u16 network_header; __u16 mac_header; #ifdef CONFIG_KCOV u64 kcov_handle; #endif ); /* end headers group */ /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; sk_buff_data_t end; unsigned char *head, *data; unsigned int truesize; refcount_t users; #ifdef CONFIG_SKB_EXTENSIONS /* only useable after checking ->active_extensions != 0 */ struct skb_ext *extensions; #endif }; /* if you move pkt_type around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD #define PKT_TYPE_MAX (7 << 5) #else #define PKT_TYPE_MAX 7 #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) /* if you move tc_at_ingress or mono_delivery_time * around, you also must adapt these constants. */ #ifdef __BIG_ENDIAN_BITFIELD #define SKB_MONO_DELIVERY_TIME_MASK (1 << 7) #define TC_AT_INGRESS_MASK (1 << 6) #else #define SKB_MONO_DELIVERY_TIME_MASK (1 << 0) #define TC_AT_INGRESS_MASK (1 << 1) #endif #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) #ifdef __KERNEL__ /* * Handling routines are only of interest to the kernel */ #define SKB_ALLOC_FCLONE 0x01 #define SKB_ALLOC_RX 0x02 #define SKB_ALLOC_NAPI 0x04 /** * skb_pfmemalloc - Test if the skb was allocated from PFMEMALLOC reserves * @skb: buffer */ static inline bool skb_pfmemalloc(const struct sk_buff *skb) { return unlikely(skb->pfmemalloc); } /* * skb might have a dst pointer attached, refcounted or not. * _skb_refdst low order bit is set if refcount was _not_ taken */ #define SKB_DST_NOREF 1UL #define SKB_DST_PTRMASK ~(SKB_DST_NOREF) /** * skb_dst - returns skb dst_entry * @skb: buffer * * Returns skb dst_entry, regardless of reference taken or not. */ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) { /* If refdst was not refcounted, check we still are in a * rcu_read_lock section */ WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) && !rcu_read_lock_held() && !rcu_read_lock_bh_held()); return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK); } /** * skb_dst_set - sets skb dst * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was taken on dst and should * be released by skb_dst_drop() */ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) { skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst; } /** * skb_dst_set_noref - sets skb dst, hopefully, without taking reference * @skb: buffer * @dst: dst entry * * Sets skb dst, assuming a reference was not taken on dst. * If dst entry is cached, we do not take reference and dst_release * will be avoided by refdst_drop. If dst entry is not cached, we take * reference, so that last dst_release can destroy the dst immediately. */ static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; } /** * skb_dst_is_noref - Test if skb dst isn't refcounted * @skb: buffer */ static inline bool skb_dst_is_noref(const struct sk_buff *skb) { return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb); } /** * skb_rtable - Returns the skb &rtable * @skb: buffer */ static inline struct rtable *skb_rtable(const struct sk_buff *skb) { return (struct rtable *)skb_dst(skb); } /* For mangling skb->pkt_type from user space side from applications * such as nft, tc, etc, we only allow a conservative subset of * possible pkt_types to be set. */ static inline bool skb_pkt_type_ok(u32 ptype) { return ptype <= PACKET_OTHERHOST; } /** * skb_napi_id - Returns the skb's NAPI id * @skb: buffer */ static inline unsigned int skb_napi_id(const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL return skb->napi_id; #else return 0; #endif } static inline bool skb_wifi_acked_valid(const struct sk_buff *skb) { #ifdef CONFIG_WIRELESS return skb->wifi_acked_valid; #else return 0; #endif } /** * skb_unref - decrement the skb's reference count * @skb: buffer * * Returns true if we can free the skb. */ static inline bool skb_unref(struct sk_buff *skb) { if (unlikely(!skb)) return false; if (likely(refcount_read(&skb->users) == 1)) smp_rmb(); else if (likely(!refcount_dec_and_test(&skb->users))) return false; return true; } void __fix_address kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); /** * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason * @skb: buffer to free */ static inline void kfree_skb(struct sk_buff *skb) { kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } void skb_release_head_state(struct sk_buff *skb); void kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason); void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt); void skb_tx_error(struct sk_buff *skb); static inline void kfree_skb_list(struct sk_buff *segs) { kfree_skb_list_reason(segs, SKB_DROP_REASON_NOT_SPECIFIED); } #ifdef CONFIG_TRACEPOINTS void consume_skb(struct sk_buff *skb); #else static inline void consume_skb(struct sk_buff *skb) { return kfree_skb(skb); } #endif void __consume_stateless_skb(struct sk_buff *skb); void __kfree_skb(struct sk_buff *skb); extern struct kmem_cache *skbuff_cache; void kfree_skb_partial(struct sk_buff *skb, bool head_stolen); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize); struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size); void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); struct sk_buff *slab_build_skb(void *data); /** * alloc_skb - allocate a network buffer * @size: size to allocate * @priority: allocation mask * * This function is a convenient wrapper around __alloc_skb(). */ static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority) { return __alloc_skb(size, priority, 0, NUMA_NO_NODE); } struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, int max_page_order, int *errcode, gfp_t gfp_mask); struct sk_buff *alloc_skb_for_msg(struct sk_buff *first); /* Layout of fast clones : [skb1][skb2][fclone_ref] */ struct sk_buff_fclones { struct sk_buff skb1; struct sk_buff skb2; refcount_t fclone_ref; }; /** * skb_fclone_busy - check if fclone is busy * @sk: socket * @skb: buffer * * Returns true if skb is a fast clone, and its clone is not freed. * Some drivers call skb_orphan() in their ndo_start_xmit(), * so we also check that didn't happen. */ static inline bool skb_fclone_busy(const struct sock *sk, const struct sk_buff *skb) { const struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); return skb->fclone == SKB_FCLONE_ORIG && refcount_read(&fclones->fclone_ref) > 1 && READ_ONCE(fclones->skb2.sk) == sk; } /** * alloc_skb_fclone - allocate a network buffer from fclone cache * @size: size to allocate * @priority: allocation mask * * This function is a convenient wrapper around __alloc_skb(). */ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); } struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); void skb_headers_offset_update(struct sk_buff *skb, int off); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old); struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority); struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, gfp_t gfp_mask, bool fclone); static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) { return __pskb_copy_fclone(skb, headroom, gfp_mask, false); } int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask); struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom); struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom, int newtailroom, gfp_t priority); int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error); /** * skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * * May return error in out of memory cases. The skb is freed on error. */ static inline int skb_pad(struct sk_buff *skb, int pad) { return __skb_pad(skb, pad, true); } #define dev_kfree_skb(a) consume_skb(a) int skb_append_pagefrags(struct sk_buff *skb, struct page *page, int offset, size_t size, size_t max_frags); struct skb_seq_state { __u32 lower_offset; __u32 upper_offset; __u32 frag_idx; __u32 stepped_offset; struct sk_buff *root_skb; struct sk_buff *cur_skb; __u8 *frag_data; __u32 frag_off; }; void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, unsigned int to, struct skb_seq_state *st); unsigned int skb_seq_read(unsigned int consumed, const u8 **data, struct skb_seq_state *st); void skb_abort_seq_read(struct skb_seq_state *st); unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config); /* * Packet hash types specify the type of hash in skb_set_hash. * * Hash types refer to the protocol layer addresses which are used to * construct a packet's hash. The hashes are used to differentiate or identify * flows of the protocol layer for the hash type. Hash types are either * layer-2 (L2), layer-3 (L3), or layer-4 (L4). * * Properties of hashes: * * 1) Two packets in different flows have different hash values * 2) Two packets in the same flow should have the same hash value * * A hash at a higher layer is considered to be more specific. A driver should * set the most specific hash possible. * * A driver cannot indicate a more specific hash than the layer at which a hash * was computed. For instance an L3 hash cannot be set as an L4 hash. * * A driver may indicate a hash level which is less specific than the * actual layer the hash was computed on. For instance, a hash computed * at L4 may be considered an L3 hash. This should only be done if the * driver can't unambiguously determine that the HW computed the hash at * the higher layer. Note that the "should" in the second property above * permits this. */ enum pkt_hash_types { PKT_HASH_TYPE_NONE, /* Undefined type */ PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */ PKT_HASH_TYPE_L3, /* Input: src_IP, dst_IP */ PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */ }; static inline void skb_clear_hash(struct sk_buff *skb) { skb->hash = 0; skb->sw_hash = 0; skb->l4_hash = 0; } static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb) { if (!skb->l4_hash) skb_clear_hash(skb); } static inline void __skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4) { skb->l4_hash = is_l4; skb->sw_hash = is_sw; skb->hash = hash; } static inline void skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type) { /* Used by drivers to set hash from HW */ __skb_set_hash(skb, hash, false, type == PKT_HASH_TYPE_L4); } static inline void __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4) { __skb_set_hash(skb, hash, true, is_l4); } void __skb_get_hash(struct sk_buff *skb); u32 __skb_get_hash_symmetric(const struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen); __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, const void *data, int hlen_proto); static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) { return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); struct bpf_flow_dissector; u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags); static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, unsigned int flags) { return __skb_flow_dissect(NULL, skb, flow_dissector, target_container, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, struct flow_keys *flow, unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(NULL, skb, &flow_keys_dissector, flow, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys_basic(const struct net *net, const struct sk_buff *skb, struct flow_keys_basic *flow, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow, data, proto, nhoff, hlen, flags); } void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); /* Gets a skb connection tracking info, ctinfo map should be a * map of mapsize to translate enum ip_conntrack_info states * to user states. */ void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, bool post_ct, u16 zone); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); void skb_flow_dissect_hash(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container); static inline __u32 skb_get_hash(struct sk_buff *skb) { if (!skb->l4_hash && !skb->sw_hash) __skb_get_hash(skb); return skb->hash; } static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { if (!skb->l4_hash && !skb->sw_hash) { struct flow_keys keys; __u32 hash = __get_hash_from_flowi6(fl6, &keys); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } return skb->hash; } __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb); static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) { return skb->hash; } static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) { to->hash = from->hash; to->sw_hash = from->sw_hash; to->l4_hash = from->l4_hash; }; static inline int skb_cmp_decrypted(const struct sk_buff *skb1, const struct sk_buff *skb2) { #ifdef CONFIG_TLS_DEVICE return skb2->decrypted - skb1->decrypted; #else return 0; #endif } static inline void skb_copy_decrypted(struct sk_buff *to, const struct sk_buff *from) { #ifdef CONFIG_TLS_DEVICE to->decrypted = from->decrypted; #endif } #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { return skb->head + skb->end; } static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end; } static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) { skb->end = offset; } #else static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { return skb->end; } static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end - skb->head; } static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) { skb->end = skb->head + offset; } #endif struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, bool success); int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg); /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) { return &skb_shinfo(skb)->hwtstamps; } static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) { bool is_zcopy = skb && skb_shinfo(skb)->flags & SKBFL_ZEROCOPY_ENABLE; return is_zcopy ? skb_uarg(skb) : NULL; } static inline bool skb_zcopy_pure(const struct sk_buff *skb) { return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY; } static inline bool skb_zcopy_managed(const struct sk_buff *skb) { return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS; } static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1, const struct sk_buff *skb2) { return skb_zcopy_pure(skb1) == skb_zcopy_pure(skb2); } static inline void net_zcopy_get(struct ubuf_info *uarg) { refcount_inc(&uarg->refcnt); } static inline void skb_zcopy_init(struct sk_buff *skb, struct ubuf_info *uarg) { skb_shinfo(skb)->destructor_arg = uarg; skb_shinfo(skb)->flags |= uarg->flags; } static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg, bool *have_ref) { if (skb && uarg && !skb_zcopy(skb)) { if (unlikely(have_ref && *have_ref)) *have_ref = false; else net_zcopy_get(uarg); skb_zcopy_init(skb, uarg); } } static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val) { skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t) val | 0x1UL); skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG; } static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb) { return (uintptr_t) skb_shinfo(skb)->destructor_arg & 0x1UL; } static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb) { return (void *)((uintptr_t) skb_shinfo(skb)->destructor_arg & ~0x1UL); } static inline void net_zcopy_put(struct ubuf_info *uarg) { if (uarg) uarg->callback(NULL, uarg, true); } static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { if (uarg->callback == msg_zerocopy_callback) msg_zerocopy_put_abort(uarg, have_uref); else if (have_uref) net_zcopy_put(uarg); } } /* Release a reference on a zerocopy structure */ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) { struct ubuf_info *uarg = skb_zcopy(skb); if (uarg) { if (!skb_zcopy_is_nouarg(skb)) uarg->callback(skb, uarg, zerocopy_success); skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } } void __skb_zcopy_downgrade_managed(struct sk_buff *skb); static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb) { if (unlikely(skb_zcopy_managed(skb))) __skb_zcopy_downgrade_managed(skb); } static inline void skb_mark_not_on_list(struct sk_buff *skb) { skb->next = NULL; } static inline void skb_poison_list(struct sk_buff *skb) { #ifdef CONFIG_DEBUG_NET skb->next = SKB_LIST_POISON_NEXT; #endif } /* Iterate through singly-linked GSO fragments of an skb. */ #define skb_list_walk_safe(first, skb, next_skb) \ for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb); \ (skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL) static inline void skb_list_del_init(struct sk_buff *skb) { __list_del_entry(&skb->list); skb_mark_not_on_list(skb); } /** * skb_queue_empty - check if a queue is empty * @list: queue head * * Returns true if the queue is empty, false otherwise. */ static inline int skb_queue_empty(const struct sk_buff_head *list) { return list->next == (const struct sk_buff *) list; } /** * skb_queue_empty_lockless - check if a queue is empty * @list: queue head * * Returns true if the queue is empty, false otherwise. * This variant can be used in lockless contexts. */ static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list) { return READ_ONCE(list->next) == (const struct sk_buff *) list; } /** * skb_queue_is_last - check if skb is the last entry in the queue * @list: queue head * @skb: buffer * * Returns true if @skb is the last buffer on the list. */ static inline bool skb_queue_is_last(const struct sk_buff_head *list, const struct sk_buff *skb) { return skb->next == (const struct sk_buff *) list; } /** * skb_queue_is_first - check if skb is the first entry in the queue * @list: queue head * @skb: buffer * * Returns true if @skb is the first buffer on the list. */ static inline bool skb_queue_is_first(const struct sk_buff_head *list, const struct sk_buff *skb) { return skb->prev == (const struct sk_buff *) list; } /** * skb_queue_next - return the next packet in the queue * @list: queue head * @skb: current buffer * * Return the next packet in @list after @skb. It is only valid to * call this if skb_queue_is_last() evaluates to false. */ static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list, const struct sk_buff *skb) { /* This BUG_ON may seem severe, but if we just return then we * are going to dereference garbage. */ BUG_ON(skb_queue_is_last(list, skb)); return skb->next; } /** * skb_queue_prev - return the prev packet in the queue * @list: queue head * @skb: current buffer * * Return the prev packet in @list before @skb. It is only valid to * call this if skb_queue_is_first() evaluates to false. */ static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list, const struct sk_buff *skb) { /* This BUG_ON may seem severe, but if we just return then we * are going to dereference garbage. */ BUG_ON(skb_queue_is_first(list, skb)); return skb->prev; } /** * skb_get - reference buffer * @skb: buffer to reference * * Makes another reference to a socket buffer and returns a pointer * to the buffer. */ static inline struct sk_buff *skb_get(struct sk_buff *skb) { refcount_inc(&skb->users); return skb; } /* * If users == 1, we are the only owner and can avoid redundant atomic changes. */ /** * skb_cloned - is the buffer a clone * @skb: buffer to check * * Returns true if the buffer was generated with skb_clone() and is * one of multiple shared copies of the buffer. Cloned buffers are * shared data so must not be written to under normal circumstances. */ static inline int skb_cloned(const struct sk_buff *skb) { return skb->cloned && (atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK) != 1; } static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); return 0; } /* This variant of skb_unclone() makes sure skb->truesize * and skb_end_offset() are not changed, whenever a new skb->head is needed. * * Indeed there is no guarantee that ksize(kmalloc(X)) == ksize(kmalloc(X)) * when various debugging features are in place. */ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri); static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) return __skb_unclone_keeptruesize(skb, pri); return 0; } /** * skb_header_cloned - is the header a clone * @skb: buffer to check * * Returns true if modifying the header part of the buffer requires * the data to be copied. */ static inline int skb_header_cloned(const struct sk_buff *skb) { int dataref; if (!skb->cloned) return 0; dataref = atomic_read(&skb_shinfo(skb)->dataref); dataref = (dataref & SKB_DATAREF_MASK) - (dataref >> SKB_DATAREF_SHIFT); return dataref != 1; } static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_header_cloned(skb)) return pskb_expand_head(skb, 0, 0, pri); return 0; } /** * __skb_header_release() - allow clones to use the headroom * @skb: buffer to operate on * * See "DOC: dataref and headerless skbs". */ static inline void __skb_header_release(struct sk_buff *skb) { skb->nohdr = 1; atomic_set(&skb_shinfo(skb)->dataref, 1 + (1 << SKB_DATAREF_SHIFT)); } /** * skb_shared - is the buffer shared * @skb: buffer to check * * Returns true if more than one person has a reference to this * buffer. */ static inline int skb_shared(const struct sk_buff *skb) { return refcount_read(&skb->users) != 1; } /** * skb_share_check - check if buffer is shared and if so clone it * @skb: buffer to check * @pri: priority for memory allocation * * If the buffer is shared the buffer is cloned and the old copy * drops a reference. A new clone with a single reference is returned. * If the buffer is not shared the original buffer is returned. When * being called from interrupt status or with spinlocks held pri must * be GFP_ATOMIC. * * NULL is returned on a memory allocation failure. */ static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, pri); if (likely(nskb)) consume_skb(skb); else kfree_skb(skb); skb = nskb; } return skb; } /* * Copy shared buffers into a new sk_buff. We effectively do COW on * packets to handle cases where we have a local reader and forward * and a couple of other messy ones. The normal one is tcpdumping * a packet that's being forwarded. */ /** * skb_unshare - make a copy of a shared buffer * @skb: buffer to check * @pri: priority for memory allocation * * If the socket buffer is a clone then this function creates a new * copy of the data, drops a reference count on the old copy and returns * the new copy with the reference count at 1. If the buffer is not a clone * the original buffer is returned. When called with a spinlock held or * from interrupt state @pri must be %GFP_ATOMIC * * %NULL is returned on a memory allocation failure. */ static inline struct sk_buff *skb_unshare(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, pri); /* Free our shared copy */ if (likely(nskb)) consume_skb(skb); else kfree_skb(skb); skb = nskb; } return skb; } /** * skb_peek - peek at the head of an &sk_buff_head * @list_: list to peek at * * Peek an &sk_buff. Unlike most other operations you _MUST_ * be careful with this one. A peek leaves the buffer on the * list and someone else may run off with it. You must hold * the appropriate locks or have a private queue to do this. * * Returns %NULL for an empty list or a pointer to the head element. * The reference count is not incremented and the reference is therefore * volatile. Use with caution. */ static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_) { struct sk_buff *skb = list_->next; if (skb == (struct sk_buff *)list_) skb = NULL; return skb; } /** * __skb_peek - peek at the head of a non-empty &sk_buff_head * @list_: list to peek at * * Like skb_peek(), but the caller knows that the list is not empty. */ static inline struct sk_buff *__skb_peek(const struct sk_buff_head *list_) { return list_->next; } /** * skb_peek_next - peek skb following the given one from a queue * @skb: skb to start from * @list_: list to peek at * * Returns %NULL when the end of the list is met or a pointer to the * next element. The reference count is not incremented and the * reference is therefore volatile. Use with caution. */ static inline struct sk_buff *skb_peek_next(struct sk_buff *skb, const struct sk_buff_head *list_) { struct sk_buff *next = skb->next; if (next == (struct sk_buff *)list_) next = NULL; return next; } /** * skb_peek_tail - peek at the tail of an &sk_buff_head * @list_: list to peek at * * Peek an &sk_buff. Unlike most other operations you _MUST_ * be careful with this one. A peek leaves the buffer on the * list and someone else may run off with it. You must hold * the appropriate locks or have a private queue to do this. * * Returns %NULL for an empty list or a pointer to the tail element. * The reference count is not incremented and the reference is therefore * volatile. Use with caution. */ static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_) { struct sk_buff *skb = READ_ONCE(list_->prev); if (skb == (struct sk_buff *)list_) skb = NULL; return skb; } /** * skb_queue_len - get queue length * @list_: list to measure * * Return the length of an &sk_buff queue. */ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) { return list_->qlen; } /** * skb_queue_len_lockless - get queue length * @list_: list to measure * * Return the length of an &sk_buff queue. * This variant can be used in lockless contexts. */ static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_) { return READ_ONCE(list_->qlen); } /** * __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head * @list: queue to initialize * * This initializes only the list and queue length aspects of * an sk_buff_head object. This allows to initialize the list * aspects of an sk_buff_head without reinitializing things like * the spinlock. It can also be used for on-stack sk_buff_head * objects where the spinlock is known to not be used. */ static inline void __skb_queue_head_init(struct sk_buff_head *list) { list->prev = list->next = (struct sk_buff *)list; list->qlen = 0; } /* * This function creates a split out lock class for each invocation; * this is needed for now since a whole lot of users of the skb-queue * infrastructure in drivers have different locking usage (in hardirq) * than the networking core (in softirq only). In the long run either the * network layer or drivers should need annotation to consolidate the * main types of usage into 3 classes. */ static inline void skb_queue_head_init(struct sk_buff_head *list) { spin_lock_init(&list->lock); __skb_queue_head_init(list); } static inline void skb_queue_head_init_class(struct sk_buff_head *list, struct lock_class_key *class) { skb_queue_head_init(list); lockdep_set_class(&list->lock, class); } /* * Insert an sk_buff on a list. * * The "__skb_xxxx()" functions are the non-atomic ones that * can only be called with interrupts disabled. */ static inline void __skb_insert(struct sk_buff *newsk, struct sk_buff *prev, struct sk_buff *next, struct sk_buff_head *list) { /* See skb_queue_empty_lockless() and skb_peek_tail() * for the opposite READ_ONCE() */ WRITE_ONCE(newsk->next, next); WRITE_ONCE(newsk->prev, prev); WRITE_ONCE(((struct sk_buff_list *)next)->prev, newsk); WRITE_ONCE(((struct sk_buff_list *)prev)->next, newsk); WRITE_ONCE(list->qlen, list->qlen + 1); } static inline void __skb_queue_splice(const struct sk_buff_head *list, struct sk_buff *prev, struct sk_buff *next) { struct sk_buff *first = list->next; struct sk_buff *last = list->prev; WRITE_ONCE(first->prev, prev); WRITE_ONCE(prev->next, first); WRITE_ONCE(last->next, next); WRITE_ONCE(next->prev, last); } /** * skb_queue_splice - join two skb lists, this is designed for stacks * @list: the new list to add * @head: the place to add it in the first list */ static inline void skb_queue_splice(const struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, (struct sk_buff *) head, head->next); head->qlen += list->qlen; } } /** * skb_queue_splice_init - join two skb lists and reinitialise the emptied list * @list: the new list to add * @head: the place to add it in the first list * * The list at @list is reinitialised */ static inline void skb_queue_splice_init(struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, (struct sk_buff *) head, head->next); head->qlen += list->qlen; __skb_queue_head_init(list); } } /** * skb_queue_splice_tail - join two skb lists, each list being a queue * @list: the new list to add * @head: the place to add it in the first list */ static inline void skb_queue_splice_tail(const struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, head->prev, (struct sk_buff *) head); head->qlen += list->qlen; } } /** * skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list * @list: the new list to add * @head: the place to add it in the first list * * Each of the lists is a queue. * The list at @list is reinitialised */ static inline void skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head) { if (!skb_queue_empty(list)) { __skb_queue_splice(list, head->prev, (struct sk_buff *) head); head->qlen += list->qlen; __skb_queue_head_init(list); } } /** * __skb_queue_after - queue a buffer at the list head * @list: list to use * @prev: place after this buffer * @newsk: buffer to queue * * Queue a buffer int the middle of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_after(struct sk_buff_head *list, struct sk_buff *prev, struct sk_buff *newsk) { __skb_insert(newsk, prev, ((struct sk_buff_list *)prev)->next, list); } void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list); static inline void __skb_queue_before(struct sk_buff_head *list, struct sk_buff *next, struct sk_buff *newsk) { __skb_insert(newsk, ((struct sk_buff_list *)next)->prev, next, list); } /** * __skb_queue_head - queue a buffer at the list head * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the start of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_after(list, (struct sk_buff *)list, newsk); } void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); /** * __skb_queue_tail - queue a buffer at the list tail * @list: list to use * @newsk: buffer to queue * * Queue a buffer at the end of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ static inline void __skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) { __skb_queue_before(list, (struct sk_buff *)list, newsk); } void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk); /* * remove sk_buff from list. _Must_ be called atomically, and with * the list known.. */ void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list); static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) { struct sk_buff *next, *prev; WRITE_ONCE(list->qlen, list->qlen - 1); next = skb->next; prev = skb->prev; skb->next = skb->prev = NULL; WRITE_ONCE(next->prev, prev); WRITE_ONCE(prev->next, next); } /** * __skb_dequeue - remove from the head of the queue * @list: list to dequeue from * * Remove the head of the list. This function does not take any locks * so must be used with appropriate locks held only. The head item is * returned or %NULL if the list is empty. */ static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek(list); if (skb) __skb_unlink(skb, list); return skb; } struct sk_buff *skb_dequeue(struct sk_buff_head *list); /** * __skb_dequeue_tail - remove from the tail of the queue * @list: list to dequeue from * * Remove the tail of the list. This function does not take any locks * so must be used with appropriate locks held only. The tail item is * returned or %NULL if the list is empty. */ static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) { struct sk_buff *skb = skb_peek_tail(list); if (skb) __skb_unlink(skb, list); return skb; } struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); static inline bool skb_is_nonlinear(const struct sk_buff *skb) { return skb->data_len; } static inline unsigned int skb_headlen(const struct sk_buff *skb) { return skb->len - skb->data_len; } static inline unsigned int __skb_pagelen(const struct sk_buff *skb) { unsigned int i, len = 0; for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--) len += skb_frag_size(&skb_shinfo(skb)->frags[i]); return len; } static inline unsigned int skb_pagelen(const struct sk_buff *skb) { return skb_headlen(skb) + __skb_pagelen(skb); } static inline void skb_frag_fill_page_desc(skb_frag_t *frag, struct page *page, int off, int size) { frag->bv_page = page; frag->bv_offset = off; skb_frag_size_set(frag, size); } static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo, int i, struct page *page, int off, int size) { skb_frag_t *frag = &shinfo->frags[i]; skb_frag_fill_page_desc(frag, page, off, size); } /** * skb_len_add - adds a number to len fields of skb * @skb: buffer to add len to * @delta: number of bytes to add */ static inline void skb_len_add(struct sk_buff *skb, int delta) { skb->len += delta; skb->data_len += delta; skb->truesize += delta; } /** * __skb_fill_page_desc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised * @i: paged fragment index to initialise * @page: the page to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * Initialises the @i'th fragment of @skb to point to &size bytes at * offset @off within @page. * * Does not take any additional reference on the fragment. */ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) { __skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size); /* Propagate page pfmemalloc to the skb if we can. The problem is * that not all callers have unique ownership of the page but rely * on page_is_pfmemalloc doing the right thing(tm). */ page = compound_head(page); if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } /** * skb_fill_page_desc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised * @i: paged fragment index to initialise * @page: the page to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * As per __skb_fill_page_desc() -- initialises the @i'th fragment of * @skb to point to @size bytes at offset @off within @page. In * addition updates @skb such that @i is the last fragment. * * Does not take any additional reference on the fragment. */ static inline void skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) { __skb_fill_page_desc(skb, i, page, off, size); skb_shinfo(skb)->nr_frags = i + 1; } /** * skb_fill_page_desc_noacc - initialise a paged fragment in an skb * @skb: buffer containing fragment to be initialised * @i: paged fragment index to initialise * @page: the page to use for this fragment * @off: the offset to the data with @page * @size: the length of the data * * Variant of skb_fill_page_desc() which does not deal with * pfmemalloc, if page is not owned by us. */ static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, struct page *page, int off, int size) { struct skb_shared_info *shinfo = skb_shinfo(skb); __skb_fill_page_desc_noacc(shinfo, i, page, off, size); shinfo->nr_frags = i + 1; } void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, unsigned int truesize); void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize); #define SKB_LINEAR_ASSERT(skb) BUG_ON(skb_is_nonlinear(skb)) #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb) { return skb->head + skb->tail; } static inline void skb_reset_tail_pointer(struct sk_buff *skb) { skb->tail = skb->data - skb->head; } static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) { skb_reset_tail_pointer(skb); skb->tail += offset; } #else /* NET_SKBUFF_DATA_USES_OFFSET */ static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb) { return skb->tail; } static inline void skb_reset_tail_pointer(struct sk_buff *skb) { skb->tail = skb->data; } static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) { skb->tail = skb->data + offset; } #endif /* NET_SKBUFF_DATA_USES_OFFSET */ static inline void skb_assert_len(struct sk_buff *skb) { #ifdef CONFIG_DEBUG_NET if (WARN_ONCE(!skb->len, "%s\n", __func__)) DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); #endif /* CONFIG_DEBUG_NET */ } /* * Add data to an sk_buff */ void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len); void *skb_put(struct sk_buff *skb, unsigned int len); static inline void *__skb_put(struct sk_buff *skb, unsigned int len) { void *tmp = skb_tail_pointer(skb); SKB_LINEAR_ASSERT(skb); skb->tail += len; skb->len += len; return tmp; } static inline void *__skb_put_zero(struct sk_buff *skb, unsigned int len) { void *tmp = __skb_put(skb, len); memset(tmp, 0, len); return tmp; } static inline void *__skb_put_data(struct sk_buff *skb, const void *data, unsigned int len) { void *tmp = __skb_put(skb, len); memcpy(tmp, data, len); return tmp; } static inline void __skb_put_u8(struct sk_buff *skb, u8 val) { *(u8 *)__skb_put(skb, 1) = val; } static inline void *skb_put_zero(struct sk_buff *skb, unsigned int len) { void *tmp = skb_put(skb, len); memset(tmp, 0, len); return tmp; } static inline void *skb_put_data(struct sk_buff *skb, const void *data, unsigned int len) { void *tmp = skb_put(skb, len); memcpy(tmp, data, len); return tmp; } static inline void skb_put_u8(struct sk_buff *skb, u8 val) { *(u8 *)skb_put(skb, 1) = val; } void *skb_push(struct sk_buff *skb, unsigned int len); static inline void *__skb_push(struct sk_buff *skb, unsigned int len) { skb->data -= len; skb->len += len; return skb->data; } void *skb_pull(struct sk_buff *skb, unsigned int len); static inline void *__skb_pull(struct sk_buff *skb, unsigned int len) { skb->len -= len; if (unlikely(skb->len < skb->data_len)) { #if defined(CONFIG_DEBUG_NET) skb->len += len; pr_err("__skb_pull(len=%u)\n", len); skb_dump(KERN_ERR, skb, false); #endif BUG(); } return skb->data += len; } static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len) { return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len); } void *skb_pull_data(struct sk_buff *skb, size_t len); void *__pskb_pull_tail(struct sk_buff *skb, int delta); static inline enum skb_drop_reason pskb_may_pull_reason(struct sk_buff *skb, unsigned int len) { if (likely(len <= skb_headlen(skb))) return SKB_NOT_DROPPED_YET; if (unlikely(len > skb->len)) return SKB_DROP_REASON_PKT_TOO_SMALL; if (unlikely(!__pskb_pull_tail(skb, len - skb_headlen(skb)))) return SKB_DROP_REASON_NOMEM; return SKB_NOT_DROPPED_YET; } static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) { return pskb_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET; } static inline void *pskb_pull(struct sk_buff *skb, unsigned int len) { if (!pskb_may_pull(skb, len)) return NULL; skb->len -= len; return skb->data += len; } void skb_condense(struct sk_buff *skb); /** * skb_headroom - bytes at buffer head * @skb: buffer to check * * Return the number of bytes of free space at the head of an &sk_buff. */ static inline unsigned int skb_headroom(const struct sk_buff *skb) { return skb->data - skb->head; } /** * skb_tailroom - bytes at buffer end * @skb: buffer to check * * Return the number of bytes of free space at the tail of an sk_buff */ static inline int skb_tailroom(const struct sk_buff *skb) { return skb_is_nonlinear(skb) ? 0 : skb->end - skb->tail; } /** * skb_availroom - bytes at buffer end * @skb: buffer to check * * Return the number of bytes of free space at the tail of an sk_buff * allocated by sk_stream_alloc() */ static inline int skb_availroom(const struct sk_buff *skb) { if (skb_is_nonlinear(skb)) return 0; return skb->end - skb->tail - skb->reserved_tailroom; } /** * skb_reserve - adjust headroom * @skb: buffer to alter * @len: bytes to move * * Increase the headroom of an empty &sk_buff by reducing the tail * room. This is only allowed for an empty buffer. */ static inline void skb_reserve(struct sk_buff *skb, int len) { skb->data += len; skb->tail += len; } /** * skb_tailroom_reserve - adjust reserved_tailroom * @skb: buffer to alter * @mtu: maximum amount of headlen permitted * @needed_tailroom: minimum amount of reserved_tailroom * * Set reserved_tailroom so that headlen can be as large as possible but * not larger than mtu and tailroom cannot be smaller than * needed_tailroom. * The required headroom should already have been reserved before using * this function. */ static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu, unsigned int needed_tailroom) { SKB_LINEAR_ASSERT(skb); if (mtu < skb_tailroom(skb) - needed_tailroom) /* use at most mtu */ skb->reserved_tailroom = skb_tailroom(skb) - mtu; else /* use up to all available space */ skb->reserved_tailroom = needed_tailroom; } #define ENCAP_TYPE_ETHER 0 #define ENCAP_TYPE_IPPROTO 1 static inline void skb_set_inner_protocol(struct sk_buff *skb, __be16 protocol) { skb->inner_protocol = protocol; skb->inner_protocol_type = ENCAP_TYPE_ETHER; } static inline void skb_set_inner_ipproto(struct sk_buff *skb, __u8 ipproto) { skb->inner_ipproto = ipproto; skb->inner_protocol_type = ENCAP_TYPE_IPPROTO; } static inline void skb_reset_inner_headers(struct sk_buff *skb) { skb->inner_mac_header = skb->mac_header; skb->inner_network_header = skb->network_header; skb->inner_transport_header = skb->transport_header; } static inline void skb_reset_mac_len(struct sk_buff *skb) { skb->mac_len = skb->network_header - skb->mac_header; } static inline unsigned char *skb_inner_transport_header(const struct sk_buff *skb) { return skb->head + skb->inner_transport_header; } static inline int skb_inner_transport_offset(const struct sk_buff *skb) { return skb_inner_transport_header(skb) - skb->data; } static inline void skb_reset_inner_transport_header(struct sk_buff *skb) { skb->inner_transport_header = skb->data - skb->head; } static inline void skb_set_inner_transport_header(struct sk_buff *skb, const int offset) { skb_reset_inner_transport_header(skb); skb->inner_transport_header += offset; } static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb) { return skb->head + skb->inner_network_header; } static inline void skb_reset_inner_network_header(struct sk_buff *skb) { skb->inner_network_header = skb->data - skb->head; } static inline void skb_set_inner_network_header(struct sk_buff *skb, const int offset) { skb_reset_inner_network_header(skb); skb->inner_network_header += offset; } static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb) { return skb->head + skb->inner_mac_header; } static inline void skb_reset_inner_mac_header(struct sk_buff *skb) { skb->inner_mac_header = skb->data - skb->head; } static inline void skb_set_inner_mac_header(struct sk_buff *skb, const int offset) { skb_reset_inner_mac_header(skb); skb->inner_mac_header += offset; } static inline bool skb_transport_header_was_set(const struct sk_buff *skb) { return skb->transport_header != (typeof(skb->transport_header))~0U; } static inline unsigned char *skb_transport_header(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb)); return skb->head + skb->transport_header; } static inline void skb_reset_transport_header(struct sk_buff *skb) { skb->transport_header = skb->data - skb->head; } static inline void skb_set_transport_header(struct sk_buff *skb, const int offset) { skb_reset_transport_header(skb); skb->transport_header += offset; } static inline unsigned char *skb_network_header(const struct sk_buff *skb) { return skb->head + skb->network_header; } static inline void skb_reset_network_header(struct sk_buff *skb) { skb->network_header = skb->data - skb->head; } static inline void skb_set_network_header(struct sk_buff *skb, const int offset) { skb_reset_network_header(skb); skb->network_header += offset; } static inline int skb_mac_header_was_set(const struct sk_buff *skb) { return skb->mac_header != (typeof(skb->mac_header))~0U; } static inline unsigned char *skb_mac_header(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb)); return skb->head + skb->mac_header; } static inline int skb_mac_offset(const struct sk_buff *skb) { return skb_mac_header(skb) - skb->data; } static inline u32 skb_mac_header_len(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb)); return skb->network_header - skb->mac_header; } static inline void skb_unset_mac_header(struct sk_buff *skb) { skb->mac_header = (typeof(skb->mac_header))~0U; } static inline void skb_reset_mac_header(struct sk_buff *skb) { skb->mac_header = skb->data - skb->head; } static inline void skb_set_mac_header(struct sk_buff *skb, const int offset) { skb_reset_mac_header(skb); skb->mac_header += offset; } static inline void skb_pop_mac_header(struct sk_buff *skb) { skb->mac_header = skb->network_header; } static inline void skb_probe_transport_header(struct sk_buff *skb) { struct flow_keys_basic keys; if (skb_transport_header_was_set(skb)) return; if (skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); } static inline void skb_mac_header_rebuild(struct sk_buff *skb) { if (skb_mac_header_was_set(skb)) { const unsigned char *old_mac = skb_mac_header(skb); skb_set_mac_header(skb, -skb->mac_len); memmove(skb_mac_header(skb), old_mac, skb->mac_len); } } static inline int skb_checksum_start_offset(const struct sk_buff *skb) { return skb->csum_start - skb_headroom(skb); } static inline unsigned char *skb_checksum_start(const struct sk_buff *skb) { return skb->head + skb->csum_start; } static inline int skb_transport_offset(const struct sk_buff *skb) { return skb_transport_header(skb) - skb->data; } static inline u32 skb_network_header_len(const struct sk_buff *skb) { return skb->transport_header - skb->network_header; } static inline u32 skb_inner_network_header_len(const struct sk_buff *skb) { return skb->inner_transport_header - skb->inner_network_header; } static inline int skb_network_offset(const struct sk_buff *skb) { return skb_network_header(skb) - skb->data; } static inline int skb_inner_network_offset(const struct sk_buff *skb) { return skb_inner_network_header(skb) - skb->data; } static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) { return pskb_may_pull(skb, skb_network_offset(skb) + len); } /* * CPUs often take a performance hit when accessing unaligned memory * locations. The actual performance hit varies, it can be small if the * hardware handles it or large if we have to take an exception and fix it * in software. * * Since an ethernet header is 14 bytes network drivers often end up with * the IP header at an unaligned offset. The IP header can be aligned by * shifting the start of the packet by 2 bytes. Drivers should do this * with: * * skb_reserve(skb, NET_IP_ALIGN); * * The downside to this alignment of the IP header is that the DMA is now * unaligned. On some architectures the cost of an unaligned DMA is high * and this cost outweighs the gains made by aligning the IP header. * * Since this trade off varies between architectures, we allow NET_IP_ALIGN * to be overridden. */ #ifndef NET_IP_ALIGN #define NET_IP_ALIGN 2 #endif /* * The networking layer reserves some headroom in skb data (via * dev_alloc_skb). This is used to avoid having to reallocate skb data when * the header has to grow. In the default case, if the header has to grow * 32 bytes or less we avoid the reallocation. * * Unfortunately this headroom changes the DMA alignment of the resulting * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive * on some architectures. An architecture can override this value, * perhaps setting it to a cacheline in size (since that will maintain * cacheline alignment of the DMA). It must be a power of 2. * * Various parts of the networking layer expect at least 32 bytes of * headroom, you should not reduce this. * * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS) * to reduce average number of cache lines per packet. * get_rps_cpu() for example only access one 64 bytes aligned block : * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8) */ #ifndef NET_SKB_PAD #define NET_SKB_PAD max(32, L1_CACHE_BYTES) #endif int ___pskb_trim(struct sk_buff *skb, unsigned int len); static inline void __skb_set_length(struct sk_buff *skb, unsigned int len) { if (WARN_ON(skb_is_nonlinear(skb))) return; skb->len = len; skb_set_tail_pointer(skb, len); } static inline void __skb_trim(struct sk_buff *skb, unsigned int len) { __skb_set_length(skb, len); } void skb_trim(struct sk_buff *skb, unsigned int len); static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) { if (skb->data_len) return ___pskb_trim(skb, len); __skb_trim(skb, len); return 0; } static inline int pskb_trim(struct sk_buff *skb, unsigned int len) { return (len < skb->len) ? __pskb_trim(skb, len) : 0; } /** * pskb_trim_unique - remove end from a paged unique (not cloned) buffer * @skb: buffer to alter * @len: new length * * This is identical to pskb_trim except that the caller knows that * the skb is not cloned so we should never get an error due to out- * of-memory. */ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len) { int err = pskb_trim(skb, len); BUG_ON(err); } static inline int __skb_grow(struct sk_buff *skb, unsigned int len) { unsigned int diff = len - skb->len; if (skb_tailroom(skb) < diff) { int ret = pskb_expand_head(skb, 0, diff - skb_tailroom(skb), GFP_ATOMIC); if (ret) return ret; } __skb_set_length(skb, len); return 0; } /** * skb_orphan - orphan a buffer * @skb: buffer to orphan * * If a buffer currently has an owner then we call the owner's * destructor function and make the @skb unowned. The buffer continues * to exist but is no longer charged to its former owner. */ static inline void skb_orphan(struct sk_buff *skb) { if (skb->destructor) { skb->destructor(skb); skb->destructor = NULL; skb->sk = NULL; } else { BUG_ON(skb->sk); } } /** * skb_orphan_frags - orphan the frags contained in a buffer * @skb: buffer to orphan frags from * @gfp_mask: allocation mask for replacement pages * * For each frag in the SKB which needs a destructor (i.e. has an * owner) create a copy of that frag and release the original * page by calling the destructor. */ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask) { if (likely(!skb_zcopy(skb))) return 0; if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN) return 0; return skb_copy_ubufs(skb, gfp_mask); } /* Frags must be orphaned, even if refcounted, if skb might loop to rx path */ static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask) { if (likely(!skb_zcopy(skb))) return 0; return skb_copy_ubufs(skb, gfp_mask); } /** * __skb_queue_purge_reason - empty a list * @list: list to empty * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function does not take the * list lock and the caller must hold the relevant locks to use it. */ static inline void __skb_queue_purge_reason(struct sk_buff_head *list, enum skb_drop_reason reason) { struct sk_buff *skb; while ((skb = __skb_dequeue(list)) != NULL) kfree_skb_reason(skb, reason); } static inline void __skb_queue_purge(struct sk_buff_head *list) { __skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE); } void skb_queue_purge_reason(struct sk_buff_head *list, enum skb_drop_reason reason); static inline void skb_queue_purge(struct sk_buff_head *list) { skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE); } unsigned int skb_rbtree_purge(struct rb_root *root); void skb_errqueue_purge(struct sk_buff_head *list); void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); /** * netdev_alloc_frag - allocate a page fragment * @fragsz: fragment size * * Allocates a frag from a page for receive buffer. * Uses GFP_ATOMIC allocations. */ static inline void *netdev_alloc_frag(unsigned int fragsz) { return __netdev_alloc_frag_align(fragsz, ~0u); } static inline void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align) { WARN_ON_ONCE(!is_power_of_2(align)); return __netdev_alloc_frag_align(fragsz, -align); } struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, gfp_t gfp_mask); /** * netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on * @length: length to allocate * * Allocate a new &sk_buff and assign it a usage count of one. The * buffer has unspecified headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * * %NULL is returned if there is no free memory. Although this function * allocates memory it can be called from an interrupt. */ static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev, unsigned int length) { return __netdev_alloc_skb(dev, length, GFP_ATOMIC); } /* legacy helper around __netdev_alloc_skb() */ static inline struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask) { return __netdev_alloc_skb(NULL, length, gfp_mask); } /* legacy helper around netdev_alloc_skb() */ static inline struct sk_buff *dev_alloc_skb(unsigned int length) { return netdev_alloc_skb(NULL, length); } static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev, unsigned int length, gfp_t gfp) { struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp); if (NET_IP_ALIGN && skb) skb_reserve(skb, NET_IP_ALIGN); return skb; } static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, unsigned int length) { return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC); } static inline void skb_free_frag(void *addr) { page_frag_free(addr); } void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask); static inline void *napi_alloc_frag(unsigned int fragsz) { return __napi_alloc_frag_align(fragsz, ~0u); } static inline void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align) { WARN_ON_ONCE(!is_power_of_2(align)); return __napi_alloc_frag_align(fragsz, -align); } struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int length, gfp_t gfp_mask); static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int length) { return __napi_alloc_skb(napi, length, GFP_ATOMIC); } void napi_consume_skb(struct sk_buff *skb, int budget); void napi_skb_free_stolen_head(struct sk_buff *skb); void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason); /** * __dev_alloc_pages - allocate page for network Rx * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx * @order: size of the allocation * * Allocate a new page. * * %NULL is returned if there is no free memory. */ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, unsigned int order) { /* This piece of code contains several assumptions. * 1. This is for device Rx, therefor a cold page is preferred. * 2. The expectation is the user wants a compound page. * 3. If requesting a order 0 page it will not be compound * due to the check to see if order has a value in prep_new_page * 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to * code in gfp_to_alloc_flags that should be enforcing this. */ gfp_mask |= __GFP_COMP | __GFP_MEMALLOC; return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); } static inline struct page *dev_alloc_pages(unsigned int order) { return __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, order); } /** * __dev_alloc_page - allocate a page for network Rx * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx * * Allocate a new page. * * %NULL is returned if there is no free memory. */ static inline struct page *__dev_alloc_page(gfp_t gfp_mask) { return __dev_alloc_pages(gfp_mask, 0); } static inline struct page *dev_alloc_page(void) { return dev_alloc_pages(0); } /** * dev_page_is_reusable - check whether a page can be reused for network Rx * @page: the page to test * * A page shouldn't be considered for reusing/recycling if it was allocated * under memory pressure or at a distant memory node. * * Returns false if this page should be returned to page allocator, true * otherwise. */ static inline bool dev_page_is_reusable(const struct page *page) { return likely(page_to_nid(page) == numa_mem_id() && !page_is_pfmemalloc(page)); } /** * skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page * @page: The page that was allocated from skb_alloc_page * @skb: The skb that may need pfmemalloc set */ static inline void skb_propagate_pfmemalloc(const struct page *page, struct sk_buff *skb) { if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } /** * skb_frag_off() - Returns the offset of a skb fragment * @frag: the paged fragment */ static inline unsigned int skb_frag_off(const skb_frag_t *frag) { return frag->bv_offset; } /** * skb_frag_off_add() - Increments the offset of a skb fragment by @delta * @frag: skb fragment * @delta: value to add */ static inline void skb_frag_off_add(skb_frag_t *frag, int delta) { frag->bv_offset += delta; } /** * skb_frag_off_set() - Sets the offset of a skb fragment * @frag: skb fragment * @offset: offset of fragment */ static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset) { frag->bv_offset = offset; } /** * skb_frag_off_copy() - Sets the offset of a skb fragment from another fragment * @fragto: skb fragment where offset is set * @fragfrom: skb fragment offset is copied from */ static inline void skb_frag_off_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom) { fragto->bv_offset = fragfrom->bv_offset; } /** * skb_frag_page - retrieve the page referred to by a paged fragment * @frag: the paged fragment * * Returns the &struct page associated with @frag. */ static inline struct page *skb_frag_page(const skb_frag_t *frag) { return frag->bv_page; } /** * __skb_frag_ref - take an addition reference on a paged fragment. * @frag: the paged fragment * * Takes an additional reference on the paged fragment @frag. */ static inline void __skb_frag_ref(skb_frag_t *frag) { get_page(skb_frag_page(frag)); } /** * skb_frag_ref - take an addition reference on a paged fragment of an skb. * @skb: the buffer * @f: the fragment offset. * * Takes an additional reference on the @f'th paged fragment of @skb. */ static inline void skb_frag_ref(struct sk_buff *skb, int f) { __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } bool napi_pp_put_page(struct page *page, bool napi_safe); static inline void napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) { struct page *page = skb_frag_page(frag); #ifdef CONFIG_PAGE_POOL if (recycle && napi_pp_put_page(page, napi_safe)) return; #endif put_page(page); } /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment * @recycle: recycle the page if allocated via page_pool * * Releases a reference on the paged fragment @frag * or recycles the page via the page_pool API. */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { napi_frag_unref(frag, recycle, false); } /** * skb_frag_unref - release a reference on a paged fragment of an skb. * @skb: the buffer * @f: the fragment offset * * Releases a reference on the @f'th paged fragment of @skb. */ static inline void skb_frag_unref(struct sk_buff *skb, int f) { struct skb_shared_info *shinfo = skb_shinfo(skb); if (!skb_zcopy_managed(skb)) __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle); } /** * skb_frag_address - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * * Returns the address of the data within @frag. The page must already * be mapped. */ static inline void *skb_frag_address(const skb_frag_t *frag) { return page_address(skb_frag_page(frag)) + skb_frag_off(frag); } /** * skb_frag_address_safe - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * * Returns the address of the data within @frag. Checks that the page * is mapped and returns %NULL otherwise. */ static inline void *skb_frag_address_safe(const skb_frag_t *frag) { void *ptr = page_address(skb_frag_page(frag)); if (unlikely(!ptr)) return NULL; return ptr + skb_frag_off(frag); } /** * skb_frag_page_copy() - sets the page in a fragment from another fragment * @fragto: skb fragment where page is set * @fragfrom: skb fragment page is copied from */ static inline void skb_frag_page_copy(skb_frag_t *fragto, const skb_frag_t *fragfrom) { fragto->bv_page = fragfrom->bv_page; } bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); /** * skb_frag_dma_map - maps a paged fragment via the DMA API * @dev: the device to map the fragment to * @frag: the paged fragment to map * @offset: the offset within the fragment (starting at the * fragment's own offset) * @size: the number of bytes to map * @dir: the direction of the mapping (``PCI_DMA_*``) * * Maps the page associated with @frag to @device. */ static inline dma_addr_t skb_frag_dma_map(struct device *dev, const skb_frag_t *frag, size_t offset, size_t size, enum dma_data_direction dir) { return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } static inline struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { return __pskb_copy(skb, skb_headroom(skb), gfp_mask); } static inline struct sk_buff *pskb_copy_for_clone(struct sk_buff *skb, gfp_t gfp_mask) { return __pskb_copy_fclone(skb, skb_headroom(skb), gfp_mask, true); } /** * skb_clone_writable - is the header of a clone writable * @skb: buffer to check * @len: length up to which to write * * Returns true if modifying the header part of the cloned buffer * does not requires the data to be copied. */ static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len) { return !skb_header_cloned(skb) && skb_headroom(skb) + len <= skb->hdr_len; } static inline int skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { return skb_cloned(skb) && !skb_clone_writable(skb, write_len) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom, int cloned) { int delta = 0; if (headroom > skb_headroom(skb)) delta = headroom - skb_headroom(skb); if (delta || cloned) return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0, GFP_ATOMIC); return 0; } /** * skb_cow - copy header of skb when it is required * @skb: buffer to cow * @headroom: needed headroom * * If the skb passed lacks sufficient headroom or its data part * is shared, data is reallocated. If reallocation fails, an error * is returned and original skb is not changed. * * The result is skb with writable area skb->head...skb->tail * and at least @headroom of space at head. */ static inline int skb_cow(struct sk_buff *skb, unsigned int headroom) { return __skb_cow(skb, headroom, skb_cloned(skb)); } /** * skb_cow_head - skb_cow but only making the head writable * @skb: buffer to cow * @headroom: needed headroom * * This function is identical to skb_cow except that we replace the * skb_cloned check by skb_header_cloned. It should be used when * you only need to push on some header and do not need to modify * the data. */ static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom) { return __skb_cow(skb, headroom, skb_header_cloned(skb)); } /** * skb_padto - pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error. */ static inline int skb_padto(struct sk_buff *skb, unsigned int len) { unsigned int size = skb->len; if (likely(size >= len)) return 0; return skb_pad(skb, len - size); } /** * __skb_put_padto - increase size and pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * @free_on_error: free buffer on error * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error if @free_on_error is true. */ static inline int __must_check __skb_put_padto(struct sk_buff *skb, unsigned int len, bool free_on_error) { unsigned int size = skb->len; if (unlikely(size < len)) { len -= size; if (__skb_pad(skb, len, free_on_error)) return -ENOMEM; __skb_put(skb, len); } return 0; } /** * skb_put_padto - increase size and pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on * success. The skb is freed on error. */ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len) { return __skb_put_padto(skb, len, true); } bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) __must_check; static inline int skb_add_data(struct sk_buff *skb, struct iov_iter *from, int copy) { const int off = skb->len; if (skb->ip_summed == CHECKSUM_NONE) { __wsum csum = 0; if (csum_and_copy_from_iter_full(skb_put(skb, copy), copy, &csum, from)) { skb->csum = csum_block_add(skb->csum, csum, off); return 0; } } else if (copy_from_iter_full(skb_put(skb, copy), copy, from)) return 0; __skb_trim(skb, off); return -EFAULT; } static inline bool skb_can_coalesce(struct sk_buff *skb, int i, const struct page *page, int off) { if (skb_zcopy(skb)) return false; if (i) { const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1]; return page == skb_frag_page(frag) && off == skb_frag_off(frag) + skb_frag_size(frag); } return false; } static inline int __skb_linearize(struct sk_buff *skb) { return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM; } /** * skb_linearize - convert paged skb to linear one * @skb: buffer to linarize * * If there is no free memory -ENOMEM is returned, otherwise zero * is returned and the old skb data released. */ static inline int skb_linearize(struct sk_buff *skb) { return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0; } /** * skb_has_shared_frag - can any frag be overwritten * @skb: buffer to test * * Return true if the skb has at least one frag that might be modified * by an external entity (as in vmsplice()/sendfile()) */ static inline bool skb_has_shared_frag(const struct sk_buff *skb) { return skb_is_nonlinear(skb) && skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; } /** * skb_linearize_cow - make sure skb is linear and writable * @skb: buffer to process * * If there is no free memory -ENOMEM is returned, otherwise zero * is returned and the old skb data released. */ static inline int skb_linearize_cow(struct sk_buff *skb) { return skb_is_nonlinear(skb) || skb_cloned(skb) ? __skb_linearize(skb) : 0; } static __always_inline void __skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len, unsigned int off) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_block_sub(skb->csum, csum_partial(start, len, 0), off); else if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; } /** * skb_postpull_rcsum - update checksum for received skb after pull * @skb: buffer to update * @start: start of data before pull * @len: length of data pulled * * After doing a pull on a received packet, you need to call this to * update the CHECKSUM_COMPLETE checksum, or set ip_summed to * CHECKSUM_NONE so that it can be recomputed from scratch. */ static inline void skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = wsum_negate(csum_partial(start, len, wsum_negate(skb->csum))); else if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; } static __always_inline void __skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len, unsigned int off) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_block_add(skb->csum, csum_partial(start, len, 0), off); } /** * skb_postpush_rcsum - update checksum for received skb after push * @skb: buffer to update * @start: start of data after push * @len: length of data pushed * * After doing a push on a received packet, you need to call this to * update the CHECKSUM_COMPLETE checksum. */ static inline void skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) { __skb_postpush_rcsum(skb, start, len, 0); } void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len); /** * skb_push_rcsum - push skb and update receive checksum * @skb: buffer to update * @len: length of data pulled * * This function performs an skb_push on the packet and updates * the CHECKSUM_COMPLETE checksum. It should be used on * receive path processing instead of skb_push unless you know * that the checksum difference is zero (e.g., a valid IP header) * or you are setting ip_summed to CHECKSUM_NONE. */ static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len) { skb_push(skb, len); skb_postpush_rcsum(skb, skb->data, len); return skb->data; } int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len); /** * pskb_trim_rcsum - trim received skb and update checksum * @skb: buffer to trim * @len: new length * * This is exactly the same as pskb_trim except that it ensures the * checksum of received packets are still valid after the operation. * It can change skb pointers. */ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) { if (likely(len >= skb->len)) return 0; return pskb_trim_rcsum_slow(skb, len); } static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; __skb_trim(skb, len); return 0; } static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; return __skb_grow(skb, len); } #define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) #define skb_rb_first(root) rb_to_skb(rb_first(root)) #define skb_rb_last(root) rb_to_skb(rb_last(root)) #define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) #define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ skb = skb->next) #define skb_queue_walk_safe(queue, skb, tmp) \ for (skb = (queue)->next, tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->next) #define skb_queue_walk_from(queue, skb) \ for (; skb != (struct sk_buff *)(queue); \ skb = skb->next) #define skb_rbtree_walk(skb, root) \ for (skb = skb_rb_first(root); skb != NULL; \ skb = skb_rb_next(skb)) #define skb_rbtree_walk_from(skb) \ for (; skb != NULL; \ skb = skb_rb_next(skb)) #define skb_rbtree_walk_from_safe(skb, tmp) \ for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL); \ skb = tmp) #define skb_queue_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->next) #define skb_queue_reverse_walk(queue, skb) \ for (skb = (queue)->prev; \ skb != (struct sk_buff *)(queue); \ skb = skb->prev) #define skb_queue_reverse_walk_safe(queue, skb, tmp) \ for (skb = (queue)->prev, tmp = skb->prev; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->prev) #define skb_queue_reverse_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->prev; \ skb != (struct sk_buff *)(queue); \ skb = tmp, tmp = skb->prev) static inline bool skb_has_frag_list(const struct sk_buff *skb) { return skb_shinfo(skb)->frag_list != NULL; } static inline void skb_frag_list_init(struct sk_buff *skb) { skb_shinfo(skb)->frag_list = NULL; } #define skb_walk_frags(skb, iter) \ for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next) int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb); struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, unsigned int flags, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err); __poll_t datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, struct msghdr *msg, int size) { return skb_copy_datagram_iter(from, offset, &msg->msg_iter, size); } int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg); int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, struct ahash_request *hash); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len); static inline void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) { __skb_free_datagram_locked(sk, skb, 0); } int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len); __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len); int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int len, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen); void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features, unsigned int offset); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); int skb_eth_pop(struct sk_buff *skb); int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, const unsigned char *src); int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len, bool ethernet); int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, bool ethernet); int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse); int skb_mpls_dec_ttl(struct sk_buff *skb); struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, gfp_t gfp); static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) { return copy_from_iter_full(data, len, &msg->msg_iter) ? 0 : -EFAULT; } static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) { return copy_to_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT; } struct skb_checksum_ops { __wsum (*update)(const void *mem, int len, __wsum wsum); __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len); }; extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly; __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum, const struct skb_checksum_ops *ops); __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); static inline void * __must_check __skb_header_pointer(const struct sk_buff *skb, int offset, int len, const void *data, int hlen, void *buffer) { if (likely(hlen - offset >= len)) return (void *)data + offset; if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0)) return NULL; return buffer; } static inline void * __must_check skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) { return __skb_header_pointer(skb, offset, len, skb->data, skb_headlen(skb), buffer); } static inline void * __must_check skb_pointer_if_linear(const struct sk_buff *skb, int offset, int len) { if (likely(skb_headlen(skb) - offset >= len)) return skb->data + offset; return NULL; } /** * skb_needs_linearize - check if we need to linearize a given skb * depending on the given device features. * @skb: socket buffer to check * @features: net device features * * Returns true if either: * 1. skb has frag_list and the device doesn't support FRAGLIST, or * 2. skb is fragmented and the device does not support SG. */ static inline bool skb_needs_linearize(struct sk_buff *skb, netdev_features_t features) { return skb_is_nonlinear(skb) && ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) || (skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG))); } static inline void skb_copy_from_linear_data(const struct sk_buff *skb, void *to, const unsigned int len) { memcpy(to, skb->data, len); } static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb, const int offset, void *to, const unsigned int len) { memcpy(to, skb->data + offset, len); } static inline void skb_copy_to_linear_data(struct sk_buff *skb, const void *from, const unsigned int len) { memcpy(skb->data, from, len); } static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb, const int offset, const void *from, const unsigned int len) { memcpy(skb->data + offset, from, len); } void skb_init(void); static inline ktime_t skb_get_ktime(const struct sk_buff *skb) { return skb->tstamp; } /** * skb_get_timestamp - get timestamp from a skb * @skb: skb to get stamp from * @stamp: pointer to struct __kernel_old_timeval to store stamp in * * Timestamps are stored in the skb as offsets to a base timestamp. * This function converts the offset back to a struct timeval and stores * it in stamp. */ static inline void skb_get_timestamp(const struct sk_buff *skb, struct __kernel_old_timeval *stamp) { *stamp = ns_to_kernel_old_timeval(skb->tstamp); } static inline void skb_get_new_timestamp(const struct sk_buff *skb, struct __kernel_sock_timeval *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_usec = ts.tv_nsec / 1000; } static inline void skb_get_timestampns(const struct sk_buff *skb, struct __kernel_old_timespec *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_nsec = ts.tv_nsec; } static inline void skb_get_new_timestampns(const struct sk_buff *skb, struct __kernel_timespec *stamp) { struct timespec64 ts = ktime_to_timespec64(skb->tstamp); stamp->tv_sec = ts.tv_sec; stamp->tv_nsec = ts.tv_nsec; } static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); skb->mono_delivery_time = 0; } static inline ktime_t net_timedelta(ktime_t t) { return ktime_sub(ktime_get_real(), t); } static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, bool mono) { skb->tstamp = kt; skb->mono_delivery_time = kt && mono; } DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); /* It is used in the ingress path to clear the delivery_time. * If needed, set the skb->tstamp to the (rcv) timestamp. */ static inline void skb_clear_delivery_time(struct sk_buff *skb) { if (skb->mono_delivery_time) { skb->mono_delivery_time = 0; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); else skb->tstamp = 0; } } static inline void skb_clear_tstamp(struct sk_buff *skb) { if (skb->mono_delivery_time) return; skb->tstamp = 0; } static inline ktime_t skb_tstamp(const struct sk_buff *skb) { if (skb->mono_delivery_time) return 0; return skb->tstamp; } static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond) { if (!skb->mono_delivery_time && skb->tstamp) return skb->tstamp; if (static_branch_unlikely(&netstamp_needed_key) || cond) return ktime_get_real(); return 0; } static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; } static inline void *skb_metadata_end(const struct sk_buff *skb) { return skb_mac_header(skb); } static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, const struct sk_buff *skb_b, u8 meta_len) { const void *a = skb_metadata_end(skb_a); const void *b = skb_metadata_end(skb_b); /* Using more efficient varaiant than plain call to memcmp(). */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 diffs = 0; switch (meta_len) { #define __it(x, op) (x -= sizeof(u##op)) #define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) case 32: diffs |= __it_diff(a, b, 64); fallthrough; case 24: diffs |= __it_diff(a, b, 64); fallthrough; case 16: diffs |= __it_diff(a, b, 64); fallthrough; case 8: diffs |= __it_diff(a, b, 64); break; case 28: diffs |= __it_diff(a, b, 64); fallthrough; case 20: diffs |= __it_diff(a, b, 64); fallthrough; case 12: diffs |= __it_diff(a, b, 64); fallthrough; case 4: diffs |= __it_diff(a, b, 32); break; } return diffs; #else return memcmp(a - meta_len, b - meta_len, meta_len); #endif } static inline bool skb_metadata_differs(const struct sk_buff *skb_a, const struct sk_buff *skb_b) { u8 len_a = skb_metadata_len(skb_a); u8 len_b = skb_metadata_len(skb_b); if (!(len_a | len_b)) return false; return len_a != len_b ? true : __skb_metadata_differs(skb_a, skb_b, len_a); } static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len) { skb_shinfo(skb)->meta_len = meta_len; } static inline void skb_metadata_clear(struct sk_buff *skb) { skb_metadata_set(skb, 0); } struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING void skb_clone_tx_timestamp(struct sk_buff *skb); bool skb_defer_rx_timestamp(struct sk_buff *skb); #else /* CONFIG_NETWORK_PHY_TIMESTAMPING */ static inline void skb_clone_tx_timestamp(struct sk_buff *skb) { } static inline bool skb_defer_rx_timestamp(struct sk_buff *skb) { return false; } #endif /* !CONFIG_NETWORK_PHY_TIMESTAMPING */ /** * skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps * * PHY drivers may accept clones of transmitted packets for * timestamping via their phy_driver.txtstamp method. These drivers * must call this function to return the skb back to the stack with a * timestamp. * * @skb: clone of the original outgoing packet * @hwtstamps: hardware time stamps * */ void skb_complete_tx_timestamp(struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps); void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, struct sock *sk, int tstype); /** * skb_tstamp_tx - queue clone of skb with send time stamps * @orig_skb: the original outgoing packet * @hwtstamps: hardware time stamps, may be NULL if not available * * If the skb has a socket associated, then this function clones the * skb (thus sharing the actual data and optional structures), stores * the optional hardware time stamping information (if non NULL) or * generates a software time stamp (otherwise), then queues the clone * to the error queue of the socket. Errors are silently ignored. */ void skb_tstamp_tx(struct sk_buff *orig_skb, struct skb_shared_hwtstamps *hwtstamps); /** * skb_tx_timestamp() - Driver hook for transmit timestamping * * Ethernet MAC Drivers should call this function in their hard_xmit() * function immediately before giving the sk_buff to the MAC hardware. * * Specifically, one should make absolutely sure that this function is * called before TX completion of this packet can trigger. Otherwise * the packet could potentially already be freed. * * @skb: A socket buffer. */ static inline void skb_tx_timestamp(struct sk_buff *skb) { skb_clone_tx_timestamp(skb); if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP) skb_tstamp_tx(skb, NULL); } /** * skb_complete_wifi_ack - deliver skb with wifi status * * @skb: the original outgoing packet * @acked: ack status * */ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked); __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); __sum16 __skb_checksum_complete(struct sk_buff *skb); static inline int skb_csum_unnecessary(const struct sk_buff *skb) { return ((skb->ip_summed == CHECKSUM_UNNECESSARY) || skb->csum_valid || (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start_offset(skb) >= 0)); } /** * skb_checksum_complete - Calculate checksum of an entire packet * @skb: packet to process * * This function calculates the checksum over the entire packet plus * the value of skb->csum. The latter can be used to supply the * checksum of a pseudo header as used by TCP/UDP. It returns the * checksum. * * For protocols that contain complete checksums such as ICMP/TCP/UDP, * this function can be used to verify that checksum on received * packets. In that case the function should return zero if the * checksum is correct. In particular, this function will return zero * if skb->ip_summed is CHECKSUM_UNNECESSARY which indicates that the * hardware has already verified the correctness of the checksum. */ static inline __sum16 skb_checksum_complete(struct sk_buff *skb) { return skb_csum_unnecessary(skb) ? 0 : __skb_checksum_complete(skb); } static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { if (skb->csum_level == 0) skb->ip_summed = CHECKSUM_NONE; else skb->csum_level--; } } static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { if (skb->csum_level < SKB_MAX_CSUM_LEVEL) skb->csum_level++; } else if (skb->ip_summed == CHECKSUM_NONE) { skb->ip_summed = CHECKSUM_UNNECESSARY; skb->csum_level = 0; } } static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_UNNECESSARY) { skb->ip_summed = CHECKSUM_NONE; skb->csum_level = 0; } } /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise * (either checksum is unnecessary or zero checksum is allowed). */ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, bool zero_okay, __sum16 check) { if (skb_csum_unnecessary(skb) || (zero_okay && !check)) { skb->csum_valid = 1; __skb_decr_checksum_unnecessary(skb); return false; } return true; } /* For small packets <= CHECKSUM_BREAK perform checksum complete directly * in checksum_init. */ #define CHECKSUM_BREAK 76 /* Unset checksum-complete * * Unset checksum complete can be done when packet is being modified * (uncompressed for instance) and checksum-complete value is * invalidated. */ static inline void skb_checksum_complete_unset(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; } /* Validate (init) checksum based on checksum complete. * * Return values: * 0: checksum is validated or try to in skb_checksum_complete. In the latter * case the ip_summed will not be CHECKSUM_UNNECESSARY and the pseudo * checksum is stored in skb->csum for use in __skb_checksum_complete * non-zero: value of invalid checksum * */ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb, bool complete, __wsum psum) { if (skb->ip_summed == CHECKSUM_COMPLETE) { if (!csum_fold(csum_add(psum, skb->csum))) { skb->csum_valid = 1; return 0; } } skb->csum = psum; if (complete || skb->len <= CHECKSUM_BREAK) { __sum16 csum; csum = __skb_checksum_complete(skb); skb->csum_valid = !csum; return csum; } return 0; } static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) { return 0; } /* Perform checksum validate (init). Note that this is a macro since we only * want to calculate the pseudo header which is an input function if necessary. * First we try to validate without any computation (checksum unnecessary) and * then calculate based on checksum complete calling the function to compute * pseudo header. * * Return values: * 0: checksum is validated or try to in skb_checksum_complete * non-zero: value of invalid checksum */ #define __skb_checksum_validate(skb, proto, complete, \ zero_okay, check, compute_pseudo) \ ({ \ __sum16 __ret = 0; \ skb->csum_valid = 0; \ if (__skb_checksum_validate_needed(skb, zero_okay, check)) \ __ret = __skb_checksum_validate_complete(skb, \ complete, compute_pseudo(skb, proto)); \ __ret; \ }) #define skb_checksum_init(skb, proto, compute_pseudo) \ __skb_checksum_validate(skb, proto, false, false, 0, compute_pseudo) #define skb_checksum_init_zero_check(skb, proto, check, compute_pseudo) \ __skb_checksum_validate(skb, proto, false, true, check, compute_pseudo) #define skb_checksum_validate(skb, proto, compute_pseudo) \ __skb_checksum_validate(skb, proto, true, false, 0, compute_pseudo) #define skb_checksum_validate_zero_check(skb, proto, check, \ compute_pseudo) \ __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo) #define skb_checksum_simple_validate(skb) \ __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo) static inline bool __skb_checksum_convert_check(struct sk_buff *skb) { return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid); } static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo) { skb->csum = ~pseudo; skb->ip_summed = CHECKSUM_COMPLETE; } #define skb_checksum_try_convert(skb, proto, compute_pseudo) \ do { \ if (__skb_checksum_convert_check(skb)) \ __skb_checksum_convert(skb, compute_pseudo(skb, proto)); \ } while (0) static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr, u16 start, u16 offset) { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = ((unsigned char *)ptr + start) - skb->head; skb->csum_offset = offset - start; } /* Update skbuf and packet to reflect the remote checksum offload operation. * When called, ptr indicates the starting point for skb->csum when * ip_summed is CHECKSUM_COMPLETE. If we need create checksum complete * here, skb_postpull_rcsum is done so skb->csum start is ptr. */ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr, int start, int offset, bool nopartial) { __wsum delta; if (!nopartial) { skb_remcsum_adjust_partial(skb, ptr, start, offset); return; } if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { __skb_checksum_complete(skb); skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data); } delta = remcsum_adjust(ptr, skb->csum, start, offset); /* Adjust skb->csum since we changed the packet */ skb->csum = csum_add(skb->csum, delta); } static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) return (void *)(skb->_nfct & NFCT_PTRMASK); #else return NULL; #endif } static inline unsigned long skb_get_nfct(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) return skb->_nfct; #else return 0UL; #endif } static inline void skb_set_nfct(struct sk_buff *skb, unsigned long nfct) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) skb->slow_gro |= !!nfct; skb->_nfct = nfct; #endif } #ifdef CONFIG_SKB_EXTENSIONS enum skb_ext_id { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) SKB_EXT_BRIDGE_NF, #endif #ifdef CONFIG_XFRM SKB_EXT_SEC_PATH, #endif #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) TC_SKB_EXT, #endif #if IS_ENABLED(CONFIG_MPTCP) SKB_EXT_MPTCP, #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) SKB_EXT_MCTP, #endif SKB_EXT_NUM, /* must be last */ }; /** * struct skb_ext - sk_buff extensions * @refcnt: 1 on allocation, deallocated on 0 * @offset: offset to add to @data to obtain extension address * @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units * @data: start of extension data, variable sized * * Note: offsets/lengths are stored in chunks of 8 bytes, this allows * to use 'u8' types while allowing up to 2kb worth of extension data. */ struct skb_ext { refcount_t refcnt; u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */ u8 chunks; /* same */ char data[] __aligned(8); }; struct skb_ext *__skb_ext_alloc(gfp_t flags); void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, struct skb_ext *ext); void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id); void __skb_ext_put(struct skb_ext *ext); static inline void skb_ext_put(struct sk_buff *skb) { if (skb->active_extensions) __skb_ext_put(skb->extensions); } static inline void __skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) { dst->active_extensions = src->active_extensions; if (src->active_extensions) { struct skb_ext *ext = src->extensions; refcount_inc(&ext->refcnt); dst->extensions = ext; } } static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) { skb_ext_put(dst); __skb_ext_copy(dst, src); } static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i) { return !!ext->offset[i]; } static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id) { return skb->active_extensions & (1 << id); } static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { if (skb_ext_exist(skb, id)) __skb_ext_del(skb, id); } static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id) { if (skb_ext_exist(skb, id)) { struct skb_ext *ext = skb->extensions; return (void *)ext + (ext->offset[id] << 3); } return NULL; } static inline void skb_ext_reset(struct sk_buff *skb) { if (unlikely(skb->active_extensions)) { __skb_ext_put(skb->extensions); skb->active_extensions = 0; } } static inline bool skb_has_extensions(struct sk_buff *skb) { return unlikely(skb->active_extensions); } #else static inline void skb_ext_put(struct sk_buff *skb) {} static inline void skb_ext_reset(struct sk_buff *skb) {} static inline void skb_ext_del(struct sk_buff *skb, int unused) {} static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} static inline bool skb_has_extensions(struct sk_buff *skb) { return false; } #endif /* CONFIG_SKB_EXTENSIONS */ static inline void nf_reset_ct(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(skb)); skb->_nfct = 0; #endif } static inline void nf_reset_trace(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) skb->nf_trace = 0; #endif } static inline void ipvs_reset(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IP_VS) skb->ipvs_property = 0; #endif } /* Note: This doesn't put any conntrack info in dst. */ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, bool copy) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) dst->_nfct = src->_nfct; nf_conntrack_get(skb_nfct(src)); #endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) if (copy) dst->nf_trace = src->nf_trace; #endif } static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(dst)); #endif dst->slow_gro = src->slow_gro; __nf_copy(dst, src, true); } #ifdef CONFIG_NETWORK_SECMARK static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) { to->secmark = from->secmark; } static inline void skb_init_secmark(struct sk_buff *skb) { skb->secmark = 0; } #else static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) { } static inline void skb_init_secmark(struct sk_buff *skb) { } #endif static inline int secpath_exists(const struct sk_buff *skb) { #ifdef CONFIG_XFRM return skb_ext_exist(skb, SKB_EXT_SEC_PATH); #else return 0; #endif } static inline bool skb_irq_freeable(const struct sk_buff *skb) { return !skb->destructor && !secpath_exists(skb) && !skb_nfct(skb) && !skb->_skb_refdst && !skb_has_frag_list(skb); } static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping) { skb->queue_mapping = queue_mapping; } static inline u16 skb_get_queue_mapping(const struct sk_buff *skb) { return skb->queue_mapping; } static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from) { to->queue_mapping = from->queue_mapping; } static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue) { skb->queue_mapping = rx_queue + 1; } static inline u16 skb_get_rx_queue(const struct sk_buff *skb) { return skb->queue_mapping - 1; } static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) { return skb->queue_mapping != 0; } static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val) { skb->dst_pending_confirm = val; } static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb) { return skb->dst_pending_confirm != 0; } static inline struct sec_path *skb_sec_path(const struct sk_buff *skb) { #ifdef CONFIG_XFRM return skb_ext_find(skb, SKB_EXT_SEC_PATH); #else return NULL; #endif } static inline bool skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_v6(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_sctp(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP; } /* Note: Should be called only if skb_is_gso(skb) is true */ static inline bool skb_is_gso_tcp(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6); } static inline void skb_gso_reset(struct sk_buff *skb) { skb_shinfo(skb)->gso_size = 0; skb_shinfo(skb)->gso_segs = 0; skb_shinfo(skb)->gso_type = 0; } static inline void skb_increase_gso_size(struct skb_shared_info *shinfo, u16 increment) { if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) return; shinfo->gso_size += increment; } static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo, u16 decrement) { if (WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS)) return; shinfo->gso_size -= decrement; } void __skb_warn_lro_forwarding(const struct sk_buff *skb); static inline bool skb_warn_if_lro(const struct sk_buff *skb) { /* LRO sets gso_size but not gso_type, whereas if GSO is really * wanted then gso_type will be set. */ const struct skb_shared_info *shinfo = skb_shinfo(skb); if (skb_is_nonlinear(skb) && shinfo->gso_size != 0 && unlikely(shinfo->gso_type == 0)) { __skb_warn_lro_forwarding(skb); return true; } return false; } static inline void skb_forward_csum(struct sk_buff *skb) { /* Unfortunately we don't support this one. Any brave souls? */ if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; } /** * skb_checksum_none_assert - make sure skb ip_summed is CHECKSUM_NONE * @skb: skb to check * * fresh skbs have their ip_summed set to CHECKSUM_NONE. * Instead of forcing ip_summed to CHECKSUM_NONE, we can * use this helper, to document places where we make this assertion. */ static inline void skb_checksum_none_assert(const struct sk_buff *skb) { DEBUG_NET_WARN_ON_ONCE(skb->ip_summed != CHECKSUM_NONE); } bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); int skb_checksum_setup(struct sk_buff *skb, bool recalculate); struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, __sum16(*skb_chkf)(struct sk_buff *skb)); /** * skb_head_is_locked - Determine if the skb->head is locked down * @skb: skb to check * * The head on skbs build around a head frag can be removed if they are * not cloned. This function returns true if the skb head is locked down * due to either being allocated via kmalloc, or by being a clone with * multiple references to the head. */ static inline bool skb_head_is_locked(const struct sk_buff *skb) { return !skb->head_frag || skb_cloned(skb); } /* Local Checksum Offload. * Compute outer checksum based on the assumption that the * inner checksum will be offloaded later. * See Documentation/networking/checksum-offloads.rst for * explanation of how this works. * Fill in outer checksum adjustment (e.g. with sum of outer * pseudo-header) before calling. * Also ensure that inner checksum is in linear data area. */ static inline __wsum lco_csum(struct sk_buff *skb) { unsigned char *csum_start = skb_checksum_start(skb); unsigned char *l4_hdr = skb_transport_header(skb); __wsum partial; /* Start with complement of inner checksum adjustment */ partial = ~csum_unfold(*(__force __sum16 *)(csum_start + skb->csum_offset)); /* Add in checksum of our headers (incl. outer checksum * adjustment filled in by caller) and return result. */ return csum_partial(l4_hdr, csum_start - l4_hdr, partial); } static inline bool skb_is_redirected(const struct sk_buff *skb) { return skb->redirected; } static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) { skb->redirected = 1; #ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; if (skb->from_ingress) skb_clear_tstamp(skb); #endif } static inline void skb_reset_redirect(struct sk_buff *skb) { skb->redirected = 0; } static inline void skb_set_redirected_noclear(struct sk_buff *skb, bool from_ingress) { skb->redirected = 1; #ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; #endif } static inline bool skb_csum_is_sctp(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IP_SCTP) return skb->csum_not_inet; #else return 0; #endif } static inline void skb_reset_csum_not_inet(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; #if IS_ENABLED(CONFIG_IP_SCTP) skb->csum_not_inet = 0; #endif } static inline void skb_set_kcov_handle(struct sk_buff *skb, const u64 kcov_handle) { #ifdef CONFIG_KCOV skb->kcov_handle = kcov_handle; #endif } static inline u64 skb_get_kcov_handle(struct sk_buff *skb) { #ifdef CONFIG_KCOV return skb->kcov_handle; #else return 0; #endif } static inline void skb_mark_for_recycle(struct sk_buff *skb) { #ifdef CONFIG_PAGE_POOL skb->pp_recycle = 1; #endif } ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp); #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */
7365 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 /* SPDX-License-Identifier: GPL-2.0 */ /* rwsem.h: R/W semaphores, public interface * * Written by David Howells (dhowells@redhat.com). * Derived from asm-i386/semaphore.h */ #ifndef _LINUX_RWSEM_H #define _LINUX_RWSEM_H #include <linux/linkage.h> #include <linux/types.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/err.h> #include <linux/cleanup.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __RWSEM_DEP_MAP_INIT(lockname) \ .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_SLEEP, \ }, #else # define __RWSEM_DEP_MAP_INIT(lockname) #endif #ifndef CONFIG_PREEMPT_RT #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include <linux/osq_lock.h> #endif /* * For an uncontended rwsem, count and owner are the only fields a task * needs to touch when acquiring the rwsem. So they are put next to each * other to increase the chance that they will share the same cacheline. * * In a contended rwsem, the owner is likely the most frequently accessed * field in the structure as the optimistic waiter that holds the osq lock * will spin on owner. For an embedded rwsem, other hot fields in the * containing structure should be moved further away from the rwsem to * reduce the chance that they will share the same cacheline causing * cacheline bouncing problem. */ struct rw_semaphore { atomic_long_t count; /* * Write owner or one of the read owners as well flags regarding * the current state of the rwsem. Can be used as a speculative * check to see if the write owner is running on the cpu. */ atomic_long_t owner; #ifdef CONFIG_RWSEM_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_RWSEMS void *magic; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; /* In all implementations count != 0 means locked */ static inline int rwsem_is_locked(struct rw_semaphore *sem) { return atomic_long_read(&sem->count) != 0; } #define RWSEM_UNLOCKED_VALUE 0L #define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) /* Common initializer macros and functions */ #ifdef CONFIG_DEBUG_RWSEMS # define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname, #else # define __RWSEM_DEBUG_INIT(lockname) #endif #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED, #else #define __RWSEM_OPT_INIT(lockname) #endif #define __RWSEM_INITIALIZER(name) \ { __RWSEM_COUNT_INIT(name), \ .owner = ATOMIC_LONG_INIT(0), \ __RWSEM_OPT_INIT(name) \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\ .wait_list = LIST_HEAD_INIT((name).wait_list), \ __RWSEM_DEBUG_INIT(name) \ __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) extern void __init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key); #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_rwsem((sem), #sem, &__key); \ } while (0) /* * This is the same regardless of which rwsem implementation that is being used. * It is just a heuristic meant to be called by somebody already holding the * rwsem to see if somebody from an incompatible type is wanting access to the * lock. */ static inline int rwsem_is_contended(struct rw_semaphore *sem) { return !list_empty(&sem->wait_list); } #else /* !CONFIG_PREEMPT_RT */ #include <linux/rwbase_rt.h> struct rw_semaphore { struct rwbase_rt rwbase; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; #define __RWSEM_INITIALIZER(name) \ { \ .rwbase = __RWBASE_INITIALIZER(name), \ __RWSEM_DEP_MAP_INIT(name) \ } #define DECLARE_RWSEM(lockname) \ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) extern void __init_rwsem(struct rw_semaphore *rwsem, const char *name, struct lock_class_key *key); #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_rwsem((sem), #sem, &__key); \ } while (0) static __always_inline int rwsem_is_locked(struct rw_semaphore *sem) { return rw_base_is_locked(&sem->rwbase); } static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) { return rw_base_is_contended(&sem->rwbase); } #endif /* CONFIG_PREEMPT_RT */ /* * The functions below are the same for all rwsem implementations including * the RT specific variant. */ /* * lock for reading */ extern void down_read(struct rw_semaphore *sem); extern int __must_check down_read_interruptible(struct rw_semaphore *sem); extern int __must_check down_read_killable(struct rw_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ extern int down_read_trylock(struct rw_semaphore *sem); /* * lock for writing */ extern void down_write(struct rw_semaphore *sem); extern int __must_check down_write_killable(struct rw_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ extern int down_write_trylock(struct rw_semaphore *sem); /* * release a read lock */ extern void up_read(struct rw_semaphore *sem); /* * release a write lock */ extern void up_write(struct rw_semaphore *sem); DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T)) DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T)) DEFINE_FREE(up_read, struct rw_semaphore *, if (_T) up_read(_T)) DEFINE_FREE(up_write, struct rw_semaphore *, if (_T) up_write(_T)) /* * downgrade write lock to read lock */ extern void downgrade_write(struct rw_semaphore *sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * nested locking. NOTE: rwsems are not allowed to recurse * (which occurs if the same task tries to acquire the same * lock instance multiple times), but multiple locks of the * same lock class might be taken, if the order of the locks * is always the same. This ordering rule can be expressed * to lockdep via the _nested() APIs, but enumerating the * subclasses that are used. (If the nesting relationship is * static then another method for expressing nested locking is * the explicit definition of lock class keys and the use of * lockdep_set_class() at lock initialization time. * See Documentation/locking/lockdep-design.rst for more details.) */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass); extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock); # define down_write_nest_lock(sem, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ } while (0) /* * Take/release a lock when not the owner will release it. * * [ This API should be avoided as much as possible - the * proper abstraction for this case is completions. ] */ extern void down_read_non_owner(struct rw_semaphore *sem); extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) # define down_read_killable_nested(sem, subclass) down_read_killable(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) # define down_write_killable_nested(sem, subclass) down_write_killable(sem) # define down_read_non_owner(sem) down_read(sem) # define up_read_non_owner(sem) up_read(sem) #endif #endif /* _LINUX_RWSEM_H */
1 65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 /* SPDX-License-Identifier: GPL-2.0 */ /* include/net/dsfield.h - Manipulation of the Differentiated Services field */ /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ #ifndef __NET_DSFIELD_H #define __NET_DSFIELD_H #include <linux/types.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <asm/byteorder.h> static inline __u8 ipv4_get_dsfield(const struct iphdr *iph) { return iph->tos; } static inline __u8 ipv6_get_dsfield(const struct ipv6hdr *ipv6h) { return ntohs(*(__force const __be16 *)ipv6h) >> 4; } static inline void ipv4_change_dsfield(struct iphdr *iph,__u8 mask, __u8 value) { __u32 check = ntohs((__force __be16)iph->check); __u8 dsfield; dsfield = (iph->tos & mask) | value; check += iph->tos; if ((check+1) >> 16) check = (check+1) & 0xffff; check -= dsfield; check += check >> 16; /* adjust carry */ iph->check = (__force __sum16)htons(check); iph->tos = dsfield; } static inline void ipv6_change_dsfield(struct ipv6hdr *ipv6h,__u8 mask, __u8 value) { __be16 *p = (__force __be16 *)ipv6h; *p = (*p & htons((((u16)mask << 4) | 0xf00f))) | htons((u16)value << 4); } #endif
48 48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 // SPDX-License-Identifier: GPL-2.0 /* * Shared application/kernel submission and completion ring pairs, for * supporting fast/efficient IO. * * A note on the read/write ordering memory barriers that are matched between * the application and kernel side. * * After the application reads the CQ ring tail, it must use an * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses * before writing the tail (using smp_load_acquire to read the tail will * do). It also needs a smp_mb() before updating CQ head (ordering the * entry load(s) with the head store), pairing with an implicit barrier * through a control-dependency in io_get_cqe (smp_store_release to * store head will do). Failure to do so could lead to reading invalid * CQ entries. * * Likewise, the application must use an appropriate smp_wmb() before * writing the SQ tail (ordering SQ entry stores with the tail store), * which pairs with smp_load_acquire in io_get_sqring (smp_store_release * to store the tail will do). And it needs a barrier ordering the SQ * head load before writing new SQ entries (smp_load_acquire to read * head will do). * * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* * updating the SQ tail; a full memory barrier smp_mb() is needed * between. * * Also see the examples in the liburing library: * * git://git.kernel.dk/liburing * * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens * from data shared between the kernel and application. This is done both * for ordering purposes, but also to ensure that once a value is loaded from * data that the application could potentially modify, it remains stable. * * Copyright (C) 2018-2019 Jens Axboe * Copyright (c) 2018-2019 Christoph Hellwig */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/syscalls.h> #include <net/compat.h> #include <linux/refcount.h> #include <linux/uio.h> #include <linux/bits.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/bvec.h> #include <linux/net.h> #include <net/sock.h> #include <net/af_unix.h> #include <net/scm.h> #include <linux/anon_inodes.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> #include <linux/nospec.h> #include <linux/highmem.h> #include <linux/fsnotify.h> #include <linux/fadvise.h> #include <linux/task_work.h> #include <linux/io_uring.h> #include <linux/audit.h> #include <linux/security.h> #include <asm/shmparam.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> #include <uapi/linux/io_uring.h> #include "io-wq.h" #include "io_uring.h" #include "opdef.h" #include "refs.h" #include "tctx.h" #include "sqpoll.h" #include "fdinfo.h" #include "kbuf.h" #include "rsrc.h" #include "cancel.h" #include "net.h" #include "notif.h" #include "waitid.h" #include "futex.h" #include "timeout.h" #include "poll.h" #include "rw.h" #include "alloc_cache.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ REQ_F_ASYNC_DATA) #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) #define IO_COMPL_BATCH 32 #define IO_REQ_ALLOC_BATCH 8 enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, }; enum { IO_EVENTFD_OP_SIGNAL_BIT, IO_EVENTFD_OP_FREE_BIT, }; struct io_defer_entry { struct list_head list; struct io_kiocb *req; u32 seq; }; /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); static void io_queue_sqe(struct io_kiocb *req); struct kmem_cache *req_cachep; static int __read_mostly sysctl_io_uring_disabled; static int __read_mostly sysctl_io_uring_group = -1; #ifdef CONFIG_SYSCTL static struct ctl_table kernel_io_uring_disabled_table[] = { { .procname = "io_uring_disabled", .data = &sysctl_io_uring_disabled, .maxlen = sizeof(sysctl_io_uring_disabled), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "io_uring_group", .data = &sysctl_io_uring_group, .maxlen = sizeof(gid_t), .mode = 0644, .proc_handler = proc_dointvec, }, {}, }; #endif struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) if (io_is_uring_fops(file)) { struct io_ring_ctx *ctx = file->private_data; return ctx->ring_sock->sk; } #endif return NULL; } EXPORT_SYMBOL(io_uring_get_socket); static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs) || ctx->submit_state.cqes_count) __io_submit_flush_completions(ctx); } static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); } static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) { return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); } static bool io_match_linked(struct io_kiocb *head) { struct io_kiocb *req; io_for_each_link(req, head) { if (req->flags & REQ_F_INFLIGHT) return true; } return false; } /* * As io_match_task() but protected against racing with linked timeouts. * User must not hold timeout_lock. */ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all) { bool matched; if (task && head->task != task) return false; if (cancel_all) return true; if (head->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = head->ctx; /* protect against races with linked timeouts */ spin_lock_irq(&ctx->timeout_lock); matched = io_match_linked(head); spin_unlock_irq(&ctx->timeout_lock); } else { matched = io_match_linked(head); } return matched; } static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); io_req_set_res(req, res, 0); } static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); } static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); complete(&ctx->ref_comp); } static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; struct io_tw_state ts = { .locked = true, }; mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) req->io_task_work.func(req, &ts); if (WARN_ON_ONCE(!ts.locked)) return; io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); } static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) { unsigned hash_buckets = 1U << bits; size_t hash_size = hash_buckets * sizeof(table->hbs[0]); table->hbs = kmalloc(hash_size, GFP_KERNEL); if (!table->hbs) return -ENOMEM; table->hash_bits = bits; init_hash_table(table, hash_buckets); return 0; } static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; xa_init(&ctx->io_bl_xa); /* * Use 5 bits less than the max cq entries, that should give us around * 32 entries per hash list if totally full and uniformly spread, but * don't keep too many buckets to not overconsume memory. */ hash_bits = ilog2(p->cq_entries) - 5; hash_bits = clamp(hash_bits, 1, 8); if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) goto err; if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) goto err; if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) goto err; ctx->flags = p->flags; init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, sizeof(struct io_rsrc_node)); io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, sizeof(struct async_poll)); io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_msghdr)); io_futex_cache_init(ctx); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); init_waitqueue_head(&ctx->rsrc_quiesce_wq); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); INIT_LIST_HEAD(&ctx->rsrc_ref_list); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_WQ_LIST(&ctx->locked_free_list); INIT_HLIST_HEAD(&ctx->waitid_list); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); #endif INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); return ctx; err: kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; } static void io_account_cq_overflow(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); ctx->cq_extra--; } static bool req_need_defer(struct io_kiocb *req, u32 seq) { if (unlikely(req->flags & REQ_F_IO_DRAIN)) { struct io_ring_ctx *ctx = req->ctx; return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; } return false; } static void io_clean_op(struct io_kiocb *req) { if (req->flags & REQ_F_BUFFER_SELECTED) { spin_lock(&req->ctx->completion_lock); io_put_kbuf_comp(req); spin_unlock(&req->ctx->completion_lock); } if (req->flags & REQ_F_NEED_CLEANUP) { const struct io_cold_def *def = &io_cold_defs[req->opcode]; if (def->cleanup) def->cleanup(req); } if ((req->flags & REQ_F_POLLED) && req->apoll) { kfree(req->apoll->double_poll); kfree(req->apoll); req->apoll = NULL; } if (req->flags & REQ_F_INFLIGHT) { struct io_uring_task *tctx = req->task->io_uring; atomic_dec(&tctx->inflight_tracked); } if (req->flags & REQ_F_CREDS) put_cred(req->creds); if (req->flags & REQ_F_ASYNC_DATA) { kfree(req->async_data); req->async_data = NULL; } req->flags &= ~IO_REQ_CLEAN_FLAGS; } static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; atomic_inc(&req->task->io_uring->inflight_tracked); } } static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { if (WARN_ON_ONCE(!req->link)) return NULL; req->flags &= ~REQ_F_ARM_LTIMEOUT; req->flags |= REQ_F_LINK_TIMEOUT; /* linked timeouts should have two refs once prep'ed */ io_req_set_refcount(req); __io_req_set_refcount(req->link, 2); return req->link; } static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) { if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) return NULL; return __io_prep_linked_timeout(req); } static noinline void __io_arm_ltimeout(struct io_kiocb *req) { io_queue_linked_timeout(__io_prep_linked_timeout(req)); } static inline void io_arm_ltimeout(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) __io_arm_ltimeout(req); } static void io_prep_async_work(struct io_kiocb *req) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CREDS)) { req->flags |= REQ_F_CREDS; req->creds = get_current_cred(); } req->work.list.next = NULL; req->work.flags = 0; req->work.cancel_seq = atomic_read(&ctx->cancel_seq); if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; if (req->file && !(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(req->file); if (req->file && (req->flags & REQ_F_ISREG)) { bool should_hash = def->hash_reg_file; /* don't serialize this request if the fs doesn't need it */ if (should_hash && (req->file->f_flags & O_DIRECT) && (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE)) should_hash = false; if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } } static void io_prep_async_link(struct io_kiocb *req) { struct io_kiocb *cur; if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); io_for_each_link(cur, req) io_prep_async_work(cur); spin_unlock_irq(&ctx->timeout_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); } } void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; BUG_ON(!tctx); BUG_ON(!tctx->io_wq); /* init ->work of the whole link before punting */ io_prep_async_link(req); /* * Not expected to happen, but if we do have a bug where this _can_ * happen, catch it here and ensure the request is marked as * canceled. That will make io-wq go through the usual work cancel * procedure rather than attempt to run this request (or create a new * worker for it). */ if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); } static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); if (req_need_defer(de->req, de->seq)) break; list_del_init(&de->list); io_req_task_queue(de->req); kfree(de); } } static void io_eventfd_ops(struct rcu_head *rcu) { struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); int ops = atomic_xchg(&ev_fd->ops, 0); if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback * ordering in a race but if references are 0 we know we have to free * it regardless. */ if (atomic_dec_and_test(&ev_fd->refs)) { eventfd_ctx_put(ev_fd->cq_ev_fd); kfree(ev_fd); } } static void io_eventfd_signal(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd = NULL; rcu_read_lock(); /* * rcu_dereference ctx->io_ev_fd once and use it for both for checking * and eventfd_signal */ ev_fd = rcu_dereference(ctx->io_ev_fd); /* * Check again if ev_fd exists incase an io_eventfd_unregister call * completed between the NULL check of ctx->io_ev_fd at the start of * the function and rcu_read_lock. */ if (unlikely(!ev_fd)) goto out; if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) goto out; if (ev_fd->eventfd_async && !io_wq_current_is_worker()) goto out; if (likely(eventfd_signal_allowed())) { eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); } else { atomic_inc(&ev_fd->refs); if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops); else atomic_dec(&ev_fd->refs); } out: rcu_read_unlock(); } static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) { bool skip; spin_lock(&ctx->completion_lock); /* * Eventfd should only get triggered when at least one event has been * posted. Some applications rely on the eventfd notification count * only changing IFF a new CQE has been added to the CQ ring. There's * no depedency on 1:1 relationship between how many times this * function is called (and hence the eventfd count) and number of CQEs * posted to the CQ ring. */ skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; ctx->evfd_last_cq_tail = ctx->cached_cq_tail; spin_unlock(&ctx->completion_lock); if (skip) return; io_eventfd_signal(ctx); } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->poll_activated) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); if (ctx->drain_active) { spin_lock(&ctx->completion_lock); io_queue_deferred(ctx); spin_unlock(&ctx->completion_lock); } if (ctx->has_evfd) io_eventfd_flush_signal(ctx); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) { if (!ctx->lockless_cq) spin_lock(&ctx->completion_lock); } static inline void io_cq_lock(struct io_ring_ctx *ctx) __acquires(ctx->completion_lock) { spin_lock(&ctx->completion_lock); } static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); if (!ctx->task_complete) { if (!ctx->lockless_cq) spin_unlock(&ctx->completion_lock); /* IOPOLL rings only need to wake up if it's also SQPOLL */ if (!ctx->syscall_iopoll) io_cqring_wake(ctx); } io_commit_cqring_flush(ctx); } static void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_wake(ctx); io_commit_cqring_flush(ctx); } /* Returns true if there are no backlogged entries after the flush */ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) { struct io_overflow_cqe *ocqe; LIST_HEAD(list); spin_lock(&ctx->completion_lock); list_splice_init(&ctx->cq_overflow_list, &list); clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); spin_unlock(&ctx->completion_lock); while (!list_empty(&list)) { ocqe = list_first_entry(&list, struct io_overflow_cqe, list); list_del(&ocqe->list); kfree(ocqe); } } static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) { size_t cqe_size = sizeof(struct io_uring_cqe); if (__io_cqring_events(ctx) == ctx->cq_entries) return; if (ctx->flags & IORING_SETUP_CQE32) cqe_size <<= 1; io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; if (!io_get_cqe_overflow(ctx, &cqe, true)) break; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); memcpy(cqe, &ocqe->cqe, cqe_size); list_del(&ocqe->list); kfree(ocqe); } if (list_empty(&ctx->cq_overflow_list)) { clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } io_cq_unlock_post(ctx); } static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) { /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); __io_cqring_overflow_flush(ctx); if (ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&ctx->uring_lock); } static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) { if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) io_cqring_do_overflow_flush(ctx); } /* can be called by any task */ static void io_put_task_remote(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; percpu_counter_sub(&tctx->inflight, 1); if (unlikely(atomic_read(&tctx->in_cancel))) wake_up(&tctx->wait); put_task_struct(task); } /* used by a task to put its own references */ static void io_put_task_local(struct task_struct *task) { task->io_uring->cached_refs++; } /* must to be called somewhat shortly after putting a request */ static inline void io_put_task(struct task_struct *task) { if (likely(task == current)) io_put_task_local(task); else io_put_task_remote(task); } void io_task_refs_refill(struct io_uring_task *tctx) { unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; percpu_counter_add(&tctx->inflight, refill); refcount_add(refill, &current->usage); tctx->cached_refs += refill; } static __cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; if (refs) { tctx->cached_refs = 0; percpu_counter_sub(&tctx->inflight, refs); put_task_struct_many(task, refs); } } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, u64 extra1, u64 extra2) { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); lockdep_assert_held(&ctx->completion_lock); if (is_cqe32) ocq_size += sizeof(struct io_uring_cqe); ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { /* * If we're in ring overflow flush mode, or in task cancel mode, * or cannot allocate an overflow entry, then we need to drop it * on the floor. */ io_account_cq_overflow(ctx); set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } if (list_empty(&ctx->cq_overflow_list)) { set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } ocqe->cqe.user_data = user_data; ocqe->cqe.res = res; ocqe->cqe.flags = cflags; if (is_cqe32) { ocqe->cqe.big_cqe[0] = extra1; ocqe->cqe.big_cqe[1] = extra2; } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } void io_req_cqe_overflow(struct io_kiocb *req) { io_cqring_event_overflow(req->ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags, req->big_cqe.extra1, req->big_cqe.extra2); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); unsigned int free, queued, len; /* * Posting into the CQ when there are pending overflowed CQEs may break * ordering guarantees, which will affect links, F_MORE users and more. * Force overflow the completion. */ if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) return false; /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); free = ctx->cq_entries - queued; /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); if (!len) return false; if (ctx->flags & IORING_SETUP_CQE32) { off <<= 1; len <<= 1; } ctx->cqe_cached = &rings->cqes[off]; ctx->cqe_sentinel = ctx->cqe_cached + len; return true; } static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { struct io_uring_cqe *cqe; ctx->cq_extra++; /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in * the ring. */ if (likely(io_get_cqe(ctx, &cqe))) { trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); if (ctx->flags & IORING_SETUP_CQE32) { WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } return true; } return false; } static void __io_flush_post_cqes(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; unsigned int i; lockdep_assert_held(&ctx->uring_lock); for (i = 0; i < state->cqes_count; i++) { struct io_uring_cqe *cqe = &ctx->completion_cqes[i]; if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); io_cqring_event_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, 0, 0); spin_unlock(&ctx->completion_lock); } else { io_cqring_event_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, 0, 0); } } } state->cqes_count = 0; } static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow) { bool filled; io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (!filled && allow_overflow) filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); io_cq_unlock_post(ctx); return filled; } bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { return __io_post_aux_cqe(ctx, user_data, res, cflags, true); } /* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. */ bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; u64 user_data = req->cqe.user_data; struct io_uring_cqe *cqe; if (!defer) return __io_post_aux_cqe(ctx, user_data, res, cflags, false); lockdep_assert_held(&ctx->uring_lock); if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) { __io_cq_lock(ctx); __io_flush_post_cqes(ctx); /* no need to flush - flush is deferred */ __io_cq_unlock_post(ctx); } /* For defered completions this is not as strict as it is otherwise, * however it's main job is to prevent unbounded posted completions, * and in that it works just as well. */ if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) return false; cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++]; cqe->user_data = user_data; cqe->res = res; cqe->flags = cflags; return true; } static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_rsrc_node *rsrc_node = NULL; io_cq_lock(ctx); if (!(req->flags & REQ_F_CQE_SKIP)) { if (!io_fill_cqe_req(ctx, req)) io_req_cqe_overflow(req); } /* * If we're the last reference to this request, add to our locked * free_list cache. */ if (req_ref_put_and_test(req)) { if (req->flags & IO_REQ_LINK_FLAGS) { if (req->flags & IO_DISARM_MASK) io_disarm_next(req); if (req->link) { io_req_task_queue(req->link); req->link = NULL; } } io_put_kbuf_comp(req); if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); io_put_file(req); rsrc_node = req->rsrc_node; /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. Clean them here to avoid * deadlocks. */ io_put_task_remote(req->task); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } io_cq_unlock_post(ctx); if (rsrc_node) { io_ring_submit_lock(ctx, issue_flags); io_put_rsrc_node(ctx, rsrc_node); io_ring_submit_unlock(ctx, issue_flags); } } void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { if (req->ctx->task_complete && req->ctx->submitter_task != current) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } else if (!(issue_flags & IO_URING_F_UNLOCKED) || !(req->ctx->flags & IORING_SETUP_IOPOLL)) { __io_req_complete_post(req, issue_flags); } else { struct io_ring_ctx *ctx = req->ctx; mutex_lock(&ctx->uring_lock); __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); mutex_unlock(&ctx->uring_lock); } } void io_req_defer_failed(struct io_kiocb *req, s32 res) __must_hold(&ctx->uring_lock) { const struct io_cold_def *def = &io_cold_defs[req->opcode]; lockdep_assert_held(&req->ctx->uring_lock); req_set_fail(req); io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); if (def->fail) def->fail(req); io_req_complete_defer(req); } /* * Don't initialise the fields below on every allocation, but do that in * advance and keep them valid across allocations. */ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) { req->ctx = ctx; req->link = NULL; req->async_data = NULL; /* not necessary, but safer to zero */ memset(&req->cqe, 0, sizeof(req->cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, struct io_submit_state *state) { spin_lock(&ctx->completion_lock); wq_list_splice(&ctx->locked_free_list, &state->free_list); ctx->locked_free_nr = 0; spin_unlock(&ctx->completion_lock); } /* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. * Because of that, io_alloc_req() should be called only under ->uring_lock * and with extra caution to not get a request that is still worked on. */ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; void *reqs[IO_REQ_ALLOC_BATCH]; int ret, i; /* * If we have more than a batch's worth of requests in our IRQ side * locked cache, grab the lock and move them over to our submission * side cache. */ if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { io_flush_cached_locked_reqs(ctx, &ctx->submit_state); if (!io_req_cache_empty(ctx)) return true; } ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); /* * Bulk alloc is all-or-nothing. If we fail to get a batch, * retry single alloc to be on the safe side. */ if (unlikely(ret <= 0)) { reqs[0] = kmem_cache_alloc(req_cachep, gfp); if (!reqs[0]) return false; ret = 1; } percpu_ref_get_many(&ctx->refs, ret); for (i = 0; i < ret; i++) { struct io_kiocb *req = reqs[i]; io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); } return true; } __cold void io_free_req(struct io_kiocb *req) { /* refs were already put, restore them for io_req_task_complete() */ req->flags &= ~REQ_F_REFCOUNT; /* we only want to free it, don't post CQEs */ req->flags |= REQ_F_CQE_SKIP; req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); io_disarm_next(req); spin_unlock(&ctx->completion_lock); } static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt; /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other * dependencies to the next request. In case of failure, fail the rest * of the chain. */ if (unlikely(req->flags & IO_DISARM_MASK)) __io_req_find_next_prep(req); nxt = req->link; req->link = NULL; return nxt; } static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) { if (!ctx) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ts->locked) { io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); ts->locked = false; } percpu_ref_put(&ctx->refs); } static unsigned int handle_tw_list(struct llist_node *node, struct io_ring_ctx **ctx, struct io_tw_state *ts, struct llist_node *last) { unsigned int count = 0; while (node && node != last) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); if (req->ctx != *ctx) { ctx_flush_and_put(*ctx, ts); *ctx = req->ctx; /* if not contended, grab and improve batching */ ts->locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, req, ts); node = next; count++; if (unlikely(need_resched())) { ctx_flush_and_put(*ctx, ts); *ctx = NULL; cond_resched(); } } return count; } /** * io_llist_xchg - swap all entries in a lock-less list * @head: the head of lock-less list to delete all entries * @new: new entry as the head of the list * * If list is empty, return NULL, otherwise, return the pointer to the first entry. * The order of entries returned is from the newest to the oldest added one. */ static inline struct llist_node *io_llist_xchg(struct llist_head *head, struct llist_node *new) { return xchg(&head->first, new); } /** * io_llist_cmpxchg - possibly swap all entries in a lock-less list * @head: the head of lock-less list to delete all entries * @old: expected old value of the first entry of the list * @new: new entry as the head of the list * * perform a cmpxchg on the first entry of the list. */ static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, struct llist_node *old, struct llist_node *new) { return cmpxchg(&head->first, old, new); } static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) { struct llist_node *node = llist_del_all(&tctx->task_list); struct io_ring_ctx *last_ctx = NULL; struct io_kiocb *req; while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); node = node->next; if (sync && last_ctx != req->ctx) { if (last_ctx) { flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } last_ctx = req->ctx; percpu_ref_get(&last_ctx->refs); } if (llist_add(&req->io_task_work.node, &req->ctx->fallback_llist)) schedule_delayed_work(&req->ctx->fallback_work, 1); } if (last_ctx) { flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } } void tctx_task_work(struct callback_head *cb) { struct io_tw_state ts = {}; struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); struct llist_node fake = {}; struct llist_node *node; unsigned int loops = 0; unsigned int count = 0; if (unlikely(current->flags & PF_EXITING)) { io_fallback_tw(tctx, true); return; } do { loops++; node = io_llist_xchg(&tctx->task_list, &fake); count += handle_tw_list(node, &ctx, &ts, &fake); /* skip expensive cmpxchg if there are items in the list */ if (READ_ONCE(tctx->task_list.first) != &fake) continue; if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { io_submit_flush_completions(ctx); if (READ_ONCE(tctx->task_list.first) != &fake) continue; } node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); } while (node != &fake); ctx_flush_and_put(ctx, &ts); /* relaxed read is enough as only the task itself sets ->in_cancel */ if (unlikely(atomic_read(&tctx->in_cancel))) io_uring_drop_tctx_refs(current); trace_io_uring_task_work_run(tctx, count, loops); } static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *first; if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) flags &= ~IOU_F_TWQ_LAZY_WAKE; first = READ_ONCE(ctx->work_llist.first); do { nr_tw_prev = 0; if (first) { struct io_kiocb *first_req = container_of(first, struct io_kiocb, io_task_work.node); /* * Might be executed at any moment, rely on * SLAB_TYPESAFE_BY_RCU to keep it alive. */ nr_tw_prev = READ_ONCE(first_req->nr_tw); } nr_tw = nr_tw_prev + 1; /* Large enough to fail the nr_wait comparison below */ if (!(flags & IOU_F_TWQ_LAZY_WAKE)) nr_tw = -1U; req->nr_tw = nr_tw; req->io_task_work.node.next = first; } while (!try_cmpxchg(&ctx->work_llist.first, &first, &req->io_task_work.node)); if (!first) { if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) io_eventfd_signal(ctx); } nr_wait = atomic_read(&ctx->cq_wait_nr); /* no one is waiting */ if (!nr_wait) return; /* either not enough or the previous add has already woken it up */ if (nr_wait > nr_tw || nr_tw_prev >= nr_wait) return; /* pairs with set_current_state() in io_cqring_wait() */ smp_mb__after_atomic(); wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } static void io_req_normal_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; /* task_work already pending, we're done */ if (!llist_add(&req->io_task_work.node, &tctx->task_list)) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; io_fallback_tw(tctx, false); } void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) { rcu_read_lock(); io_req_local_work_add(req, flags); rcu_read_unlock(); } else { io_req_normal_work_add(req); } } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) { struct llist_node *node; node = llist_del_all(&ctx->work_llist); while (node) { struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); node = node->next; io_req_normal_work_add(req); } } static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) { struct llist_node *node; unsigned int loops = 0; int ret = 0; if (WARN_ON_ONCE(ctx->submitter_task != current)) return -EEXIST; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: /* * llists are in reverse order, flip it back the right way before * running the pending items. */ node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL)); while (node) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, req, ts); ret++; node = next; } loops++; if (!llist_empty(&ctx->work_llist)) goto again; if (ts->locked) { io_submit_flush_completions(ctx); if (!llist_empty(&ctx->work_llist)) goto again; } trace_io_uring_local_work_run(ctx, ret, loops); return ret; } static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) { struct io_tw_state ts = { .locked = true, }; int ret; if (llist_empty(&ctx->work_llist)) return 0; ret = __io_run_local_work(ctx, &ts); /* shouldn't happen! */ if (WARN_ON_ONCE(!ts.locked)) mutex_lock(&ctx->uring_lock); return ret; } static int io_run_local_work(struct io_ring_ctx *ctx) { struct io_tw_state ts = {}; int ret; ts.locked = mutex_trylock(&ctx->uring_lock); ret = __io_run_local_work(ctx, &ts); if (ts.locked) mutex_unlock(&ctx->uring_lock); return ret; } static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) { io_tw_lock(req->ctx, ts); io_req_defer_failed(req, req->cqe.res); } void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) { io_tw_lock(req->ctx, ts); /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req, ts); else io_queue_sqe(req); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) { io_req_set_res(req, ret, 0); req->io_task_work.func = io_req_task_cancel; io_req_task_work_add(req); } void io_req_task_queue(struct io_kiocb *req) { req->io_task_work.func = io_req_task_submit; io_req_task_work_add(req); } void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); if (nxt) io_req_task_queue(nxt); } static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) { do { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { if (req->flags & REQ_F_REFCOUNT) { node = req->comp_list.next; if (!req_ref_put_and_test(req)) continue; } if ((req->flags & REQ_F_POLLED) && req->apoll) { struct async_poll *apoll = req->apoll; if (apoll->double_poll) kfree(apoll->double_poll); if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache)) kfree(apoll); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) io_queue_next(req); if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); } io_put_file(req); io_req_put_rsrc_locked(req, ctx); io_put_task(req->task); node = req->comp_list.next; io_req_add_to_cache(req, ctx); } while (node); } void __io_submit_flush_completions(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; struct io_wq_work_node *node; __io_cq_lock(ctx); /* must come first to preserve CQE ordering in failure cases */ if (state->cqes_count) __io_flush_post_cqes(ctx); __wq_list_for_each(node, &state->compl_reqs) { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); if (!(req->flags & REQ_F_CQE_SKIP) && unlikely(!io_fill_cqe_req(ctx, req))) { if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); io_req_cqe_overflow(req); spin_unlock(&ctx->completion_lock); } else { io_req_cqe_overflow(req); } } } __io_cq_unlock_post(ctx); if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } } static unsigned io_cqring_events(struct io_ring_ctx *ctx) { /* See comment at the top of this file */ smp_rmb(); return __io_cqring_events(ctx); } /* * We can't just wait for polled events to come to us, we have to actively * find and complete them. */ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; mutex_lock(&ctx->uring_lock); while (!wq_list_empty(&ctx->iopoll_list)) { /* let it sleep and repeat later if can't complete a request */ if (io_do_iopoll(ctx, true) == 0) break; /* * Ensure we allow local-to-the-cpu processing to take place, * in this case we need to ensure that we reap all events. * Also let task_work, etc. to progress by releasing the mutex */ if (need_resched()) { mutex_unlock(&ctx->uring_lock); cond_resched(); mutex_lock(&ctx->uring_lock); } } mutex_unlock(&ctx->uring_lock); } static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { unsigned int nr_events = 0; unsigned long check_cq; if (!io_allowed_run_tw(ctx)) return -EEXIST; check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) __io_cqring_overflow_flush(ctx); /* * Similarly do not spin if we have not informed the user of any * dropped CQE. */ if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) return -EBADR; } /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ if (io_cqring_events(ctx)) return 0; do { int ret = 0; /* * If a submit got punted to a workqueue, we can have the * application entering polling for a command before it gets * issued. That app will hold the uring_lock for the duration * of the poll right here, so we need to take a breather every * now and then to ensure that the issue has a chance to add * the poll to the issued list. Otherwise we can spin here * forever, while the workqueue is stuck trying to acquire the * very same mutex. */ if (wq_list_empty(&ctx->iopoll_list) || io_task_work_pending(ctx)) { u32 tail = ctx->cached_cq_tail; (void) io_run_local_work_locked(ctx); if (task_work_pending(current) || wq_list_empty(&ctx->iopoll_list)) { mutex_unlock(&ctx->uring_lock); io_run_task_work(); mutex_lock(&ctx->uring_lock); } /* some requests don't go through iopoll_list */ if (tail != ctx->cached_cq_tail || wq_list_empty(&ctx->iopoll_list)) break; } ret = io_do_iopoll(ctx, !min); if (unlikely(ret < 0)) return ret; if (task_sigpending(current)) return -EINTR; if (need_resched()) break; nr_events += ret; } while (nr_events < min); return 0; } void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) { if (ts->locked) io_req_complete_defer(req); else io_req_complete_post(req, IO_URING_F_UNLOCKED); } /* * After the iocb has been issued, it's safe to be found on the poll list. * Adding the kiocb to the list AFTER submission ensures that we don't * find it from a io_do_iopoll() thread before the issuer is done * accessing the kiocb cookie. */ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; /* workqueue context doesn't hold uring_lock, grab it now */ if (unlikely(needs_lock)) mutex_lock(&ctx->uring_lock); /* * Track whether we have multiple files in our lists. This will impact * how we do polling eventually, not spinning if we're on potentially * different devices. */ if (wq_list_empty(&ctx->iopoll_list)) { ctx->poll_multi_queue = false; } else if (!ctx->poll_multi_queue) { struct io_kiocb *list_req; list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, comp_list); if (list_req->file != req->file) ctx->poll_multi_queue = true; } /* * For fast devices, IO may have already completed. If it has, add * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) wq_list_add_head(&req->comp_list, &ctx->iopoll_list); else wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); if (unlikely(needs_lock)) { /* * If IORING_SETUP_SQPOLL is enabled, sqes are either handle * in sq thread task context or in io worker task context. If * current task context is sq thread, we don't need to check * whether should wake up sq thread. */ if ((ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); mutex_unlock(&ctx->uring_lock); } } unsigned int io_file_get_flags(struct file *file) { unsigned int res = 0; if (S_ISREG(file_inode(file)->i_mode)) res |= REQ_F_ISREG; if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) res |= REQ_F_SUPPORT_NOWAIT; return res; } bool io_alloc_async_data(struct io_kiocb *req) { WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size); req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL); if (req->async_data) { req->flags |= REQ_F_ASYNC_DATA; return false; } return true; } int io_req_prep_async(struct io_kiocb *req) { const struct io_cold_def *cdef = &io_cold_defs[req->opcode]; const struct io_issue_def *def = &io_issue_defs[req->opcode]; /* assign early for deferred execution for non-fixed file */ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file) req->file = io_file_get_normal(req, req->cqe.fd); if (!cdef->prep_async) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; if (!def->manual_alloc) { if (io_alloc_async_data(req)) return -EAGAIN; } return cdef->prep_async(req); } static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; struct io_kiocb *cur; /* need original cached_sq_head, but it was increased for each req */ io_for_each_link(cur, req) seq--; return seq; } static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; int ret; u32 seq = io_get_sequence(req); /* Still need defer if there is pending req in defer list. */ spin_lock(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { spin_unlock(&ctx->completion_lock); queue: ctx->drain_active = false; io_req_task_queue(req); return; } spin_unlock(&ctx->completion_lock); io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL); if (!de) { ret = -ENOMEM; io_req_defer_failed(req, ret); return; } spin_lock(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock(&ctx->completion_lock); kfree(de); goto queue; } trace_io_uring_defer(req); de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); spin_unlock(&ctx->completion_lock); } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, unsigned int issue_flags) { if (req->file || !def->needs_file) return true; if (req->flags & REQ_F_FIXED_FILE) req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); else req->file = io_file_get_normal(req, req->cqe.fd); return !!req->file; } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; const struct cred *creds = NULL; int ret; if (unlikely(!io_assign_file(req, def, issue_flags))) return -EBADF; if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) creds = override_creds(req->creds); if (!def->audit_skip) audit_uring_entry(req->opcode); ret = def->issue(req, issue_flags); if (!def->audit_skip) audit_uring_exit(!ret, ret); if (creds) revert_creds(creds); if (ret == IOU_OK) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); else io_req_complete_post(req, issue_flags); } else if (ret != IOU_ISSUE_SKIP_COMPLETE) return ret; /* If the op doesn't have a file, we're not polling for it */ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) io_iopoll_req_issued(req, issue_flags); return 0; } int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) { io_tw_lock(req->ctx, ts); return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| IO_URING_F_COMPLETE_DEFER); } struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_kiocb *nxt = NULL; if (req_ref_put_and_test(req)) { if (req->flags & IO_REQ_LINK_FLAGS) nxt = io_req_find_next(req); io_free_req(req); } return nxt ? &nxt->work : NULL; } void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); const struct io_issue_def *def = &io_issue_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ; bool needs_poll = false; int ret = 0, err = -ECANCELED; /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else req_ref_get(req); io_arm_ltimeout(req); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) { fail: io_req_task_queue_fail(req, err); return; } if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; work->flags |= IO_WQ_WORK_CANCEL; goto fail; } if (req->flags & REQ_F_FORCE_ASYNC) { bool opcode_poll = def->pollin || def->pollout; if (opcode_poll && file_can_poll(req->file)) { needs_poll = true; issue_flags |= IO_URING_F_NONBLOCK; } } do { ret = io_issue_sqe(req, issue_flags); if (ret != -EAGAIN) break; /* * If REQ_F_NOWAIT is set, then don't wait or retry with * poll. -EAGAIN is final for that case. */ if (req->flags & REQ_F_NOWAIT) break; /* * We can get EAGAIN for iopolled IO even though we're * forcing a sync submission from here, since we can't * wait for request slots on the block side. */ if (!needs_poll) { if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) break; if (io_wq_worker_stopped()) break; cond_resched(); continue; } if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK) return; /* aborted or ready, in either case retry blocking */ needs_poll = false; issue_flags &= ~IO_URING_F_NONBLOCK; } while (1); /* avoid locking problems by failing it from a clean context */ if (ret < 0) io_req_task_queue_fail(req, ret); } inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_fixed_file *slot; struct file *file = NULL; io_ring_submit_lock(ctx, issue_flags); if (unlikely((unsigned int)fd >= ctx->nr_user_files)) goto out; fd = array_index_nospec(fd, ctx->nr_user_files); slot = io_fixed_file_slot(&ctx->file_table, fd); file = io_slot_file(slot); req->flags |= io_slot_flags(slot); io_req_set_rsrc_node(req, ctx, 0); out: io_ring_submit_unlock(ctx, issue_flags); return file; } struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); trace_io_uring_file_get(req, fd); /* we don't allow fixed io_uring files */ if (file && io_is_uring_fops(file)) io_req_track_inflight(req); return file; } static void io_queue_async(struct io_kiocb *req, int ret) __must_hold(&req->ctx->uring_lock) { struct io_kiocb *linked_timeout; if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { io_req_defer_failed(req, ret); return; } linked_timeout = io_prep_linked_timeout(req); switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: io_kbuf_recycle(req, 0); io_req_task_queue(req); break; case IO_APOLL_ABORTED: io_kbuf_recycle(req, 0); io_queue_iowq(req, NULL); break; case IO_APOLL_OK: break; } if (linked_timeout) io_queue_linked_timeout(linked_timeout); } static inline void io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { int ret; ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ if (likely(!ret)) io_arm_ltimeout(req); else io_queue_async(req, ret); } static void io_queue_sqe_fallback(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { if (unlikely(req->flags & REQ_F_FAIL)) { /* * We don't submit, fail them all, for that replace hardlinks * with normal links. Extra REQ_F_LINK is tolerated. */ req->flags &= ~REQ_F_HARDLINK; req->flags |= REQ_F_LINK; io_req_defer_failed(req, req->cqe.res); } else { int ret = io_req_prep_async(req); if (unlikely(ret)) { io_req_defer_failed(req, ret); return; } if (unlikely(req->ctx->drain_active)) io_drain_req(req); else io_queue_iowq(req, NULL); } } /* * Check SQE restrictions (opcode and flags). * * Returns 'true' if SQE is allowed, 'false' otherwise. */ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; if ((sqe_flags & ctx->restrictions.sqe_flags_required) != ctx->restrictions.sqe_flags_required) return false; if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | ctx->restrictions.sqe_flags_required)) return false; return true; } static void io_init_req_drain(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *head = ctx->submit_state.link.head; ctx->drain_active = true; if (head) { /* * If we need to drain a request in the middle of a link, drain * the head request and the next request/link after the current * link. Considering sequential execution of links, * REQ_F_IO_DRAIN will be maintained for every request of our * link. */ head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; ctx->drain_next = true; } } static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { const struct io_issue_def *def; unsigned int sqe_flags; int personality; u8 opcode; /* req is partially pre-initialised, see io_preinit_req() */ req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->rsrc_node = NULL; req->task = current; if (unlikely(opcode >= IORING_OP_LAST)) { req->opcode = 0; return -EINVAL; } def = &io_issue_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) return -EINVAL; if (sqe_flags & IOSQE_BUFFER_SELECT) { if (!def->buffer_select) return -EOPNOTSUPP; req->buf_index = READ_ONCE(sqe->buf_group); } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) ctx->drain_disabled = true; if (sqe_flags & IOSQE_IO_DRAIN) { if (ctx->drain_disabled) return -EOPNOTSUPP; io_init_req_drain(req); } } if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) return -EACCES; /* knock it to the slow queue path, will be drained there */ if (ctx->drain_active) req->flags |= REQ_F_FORCE_ASYNC; /* if there is no link, we're at "next" request and need to drain */ if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { ctx->drain_next = false; ctx->drain_active = true; req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; } } if (!def->ioprio && sqe->ioprio) return -EINVAL; if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (def->needs_file) { struct io_submit_state *state = &ctx->submit_state; req->cqe.fd = READ_ONCE(sqe->fd); /* * Plug now if we have more than 2 IO left after this, and the * target is potentially a read/write to block based storage. */ if (state->need_plug && def->plug) { state->plug_started = true; state->need_plug = false; blk_start_plug_nr_ios(&state->plug, state->submit_nr); } } personality = READ_ONCE(sqe->personality); if (personality) { int ret; req->creds = xa_load(&ctx->personalities, personality); if (!req->creds) return -EINVAL; get_cred(req->creds); ret = security_uring_override_creds(req->creds); if (ret) { put_cred(req->creds); return ret; } req->flags |= REQ_F_CREDS; } return def->prep(req, sqe); } static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, struct io_kiocb *req, int ret) { struct io_ring_ctx *ctx = req->ctx; struct io_submit_link *link = &ctx->submit_state.link; struct io_kiocb *head = link->head; trace_io_uring_req_failed(sqe, req, ret); /* * Avoid breaking links in the middle as it renders links with SQPOLL * unusable. Instead of failing eagerly, continue assembling the link if * applicable and mark the head with REQ_F_FAIL. The link flushing code * should find the flag and handle the rest. */ req_fail_link_node(req, ret); if (head && !(head->flags & REQ_F_FAIL)) req_fail_link_node(head, -ECANCELED); if (!(req->flags & IO_REQ_LINK_FLAGS)) { if (head) { link->last->link = req; link->head = NULL; req = head; } io_queue_sqe_fallback(req); return ret; } if (head) link->last->link = req; else link->head = req; link->last = req; return 0; } static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { struct io_submit_link *link = &ctx->submit_state.link; int ret; ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); trace_io_uring_submit_req(req); /* * If we already have a head request, queue this one for async * submittal once the head completes. If we don't have a head but * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be * submitted sync once the chain is complete. If none of those * conditions are true (normal request), then just queue it. */ if (unlikely(link->head)) { ret = io_req_prep_async(req); if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); trace_io_uring_link(req, link->head); link->last->link = req; link->last = req; if (req->flags & IO_REQ_LINK_FLAGS) return 0; /* last request of the link, flush it */ req = link->head; link->head = NULL; if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)) goto fallback; } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS | REQ_F_FORCE_ASYNC | REQ_F_FAIL))) { if (req->flags & IO_REQ_LINK_FLAGS) { link->head = req; link->last = req; } else { fallback: io_queue_sqe_fallback(req); } return 0; } io_queue_sqe(req); return 0; } /* * Batched submission is done, ensure local IO is flushed out. */ static void io_submit_state_end(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; if (unlikely(state->link.head)) io_queue_sqe_fallback(state->link.head); /* flush only after queuing links as they can generate completions */ io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); } /* * Start submission side cache. */ static void io_submit_state_start(struct io_submit_state *state, unsigned int max_ios) { state->plug_started = false; state->need_plug = max_ios > 2; state->submit_nr = max_ios; /* set only head, no need to init link_last in advance */ state->link.head = NULL; } static void io_commit_sqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; /* * Ensure any loads from the SQEs are done at this point, * since once we write the new head, the application could * write new data to them. */ smp_store_release(&rings->sq.head, ctx->cached_sq_head); } /* * Fetch an sqe, if one is available. Note this returns a pointer to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always * being a good citizen. If members of the sqe are validated and then later * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) { unsigned mask = ctx->sq_entries - 1; unsigned head = ctx->cached_sq_head++ & mask; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { /* drop invalid entries */ spin_lock(&ctx->completion_lock); ctx->cq_extra--; spin_unlock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); return false; } } /* * The cached sq head (or cq tail) serves two purposes: * * 1) allows us to batch the cost of updating the user visible * head updates. * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. */ /* double index for 128-byte SQEs, twice as long */ if (ctx->flags & IORING_SETUP_SQE128) head <<= 1; *sqe = &ctx->sq_sqes[head]; return true; } int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { unsigned int entries = io_sqring_entries(ctx); unsigned int left; int ret; if (unlikely(!entries)) return 0; /* make sure SQ entry isn't read before tail */ ret = left = min(nr, entries); io_get_task_refs(left); io_submit_state_start(&ctx->submit_state, left); do { const struct io_uring_sqe *sqe; struct io_kiocb *req; if (unlikely(!io_alloc_req(ctx, &req))) break; if (unlikely(!io_get_sqe(ctx, &sqe))) { io_req_add_to_cache(req, ctx); break; } /* * Continue submitting even for sqe failure if the * ring was setup with IORING_SETUP_SUBMIT_ALL */ if (unlikely(io_submit_sqe(ctx, req, sqe)) && !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { left--; break; } } while (--left); if (unlikely(left)) { ret -= left; /* try again if it submitted nothing and can't allocate a req */ if (!ret && io_req_cache_empty(ctx)) ret = -EAGAIN; current->io_uring->cached_refs += left; } io_submit_state_end(ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); return ret; } struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; unsigned cq_tail; unsigned nr_timeouts; ktime_t timeout; }; static inline bool io_has_work(struct io_ring_ctx *ctx) { return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || !llist_empty(&ctx->work_llist); } static inline bool io_should_wake(struct io_wait_queue *iowq) { struct io_ring_ctx *ctx = iowq->ctx; int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail; /* * Wake up if we have enough events, or if a timeout occurred since we * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int wake_flags, void *key) { struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); /* * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. */ if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } int io_run_task_work_sig(struct io_ring_ctx *ctx) { if (!llist_empty(&ctx->work_llist)) { __set_current_state(TASK_RUNNING); if (io_run_local_work(ctx) > 0) return 0; } if (io_run_task_work() > 0) return 0; if (task_sigpending(current)) return -EINTR; return 0; } static bool current_pending_io(void) { struct io_uring_task *tctx = current->io_uring; if (!tctx) return false; return percpu_counter_read_positive(&tctx->inflight); } /* when returns >0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { int io_wait, ret; if (unlikely(READ_ONCE(ctx->check_cq))) return 1; if (unlikely(!llist_empty(&ctx->work_llist))) return 1; if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) return 1; if (unlikely(task_sigpending(current))) return -EINTR; if (unlikely(io_should_wake(iowq))) return 0; /* * Mark us as being in io_wait if we have pending requests, so cpufreq * can take into account that the task is waiting for IO - turns out * to be important for low QD IO. */ io_wait = current->in_iowait; if (current_pending_io()) current->in_iowait = 1; ret = 0; if (iowq->timeout == KTIME_MAX) schedule(); else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) ret = -ETIME; current->in_iowait = io_wait; return ret; } /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. */ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz, struct __kernel_timespec __user *uts) { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; int ret; if (!io_allowed_run_tw(ctx)) return -EEXIST; if (!llist_empty(&ctx->work_llist)) io_run_local_work(ctx); io_run_task_work(); io_cqring_overflow_flush(ctx); /* if user messes with these they will just get an early return */ if (__io_cqring_events_user(ctx) >= min_events) return 0; if (sig) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, sigsz); else #endif ret = set_user_sigmask(sig, sigsz); if (ret) return ret; } init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); iowq.ctx = ctx; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; iowq.timeout = KTIME_MAX; if (uts) { struct timespec64 ts; if (get_timespec64(&ts, uts)) return -EFAULT; iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); } trace_io_uring_cqring_wait(ctx, min_events); do { unsigned long check_cq; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); atomic_set(&ctx->cq_wait_nr, nr_wait); set_current_state(TASK_INTERRUPTIBLE); } else { prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); } ret = io_cqring_wait_schedule(ctx, &iowq); __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, 0); if (ret < 0) break; /* * Run task_work after scheduling and before io_should_wake(). * If we got woken because of task_work being processed, run it * now rather than let the caller do another wait loop. */ io_run_task_work(); if (!llist_empty(&ctx->work_llist)) io_run_local_work(ctx); check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { /* let the caller flush overflows, retry */ if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) io_cqring_do_overflow_flush(ctx); if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { ret = -EBADR; break; } } if (io_should_wake(&iowq)) { ret = 0; break; } cond_resched(); } while (1); if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) finish_wait(&ctx->cq_wait, &iowq.wq); restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } static void io_mem_free(void *ptr) { if (!ptr) return; folio_put(virt_to_folio(ptr)); } static void io_pages_free(struct page ***pages, int npages) { struct page **page_array; int i; if (!pages) return; page_array = *pages; if (!page_array) return; for (i = 0; i < npages; i++) unpin_user_page(page_array[i]); kvfree(page_array); *pages = NULL; } static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, unsigned long uaddr, size_t size) { struct page **page_array; unsigned int nr_pages; int ret, i; *npages = 0; if (uaddr & (PAGE_SIZE - 1) || !size) return ERR_PTR(-EINVAL); nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; if (nr_pages > USHRT_MAX) return ERR_PTR(-EINVAL); page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!page_array) return ERR_PTR(-ENOMEM); ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, page_array); if (ret != nr_pages) { err: io_pages_free(&page_array, ret > 0 ? ret : 0); return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT); } /* * Should be a single page. If the ring is small enough that we can * use a normal page, that is fine. If we need multiple pages, then * userspace should use a huge page. That's the only way to guarantee * that we get contigious memory, outside of just being lucky or * (currently) having low memory fragmentation. */ if (page_array[0] != page_array[ret - 1]) goto err; /* * Can't support mapping user allocated ring memory on 32-bit archs * where it could potentially reside in highmem. Just fail those with * -EINVAL, just like we did on kernels that didn't support this * feature. */ for (i = 0; i < nr_pages; i++) { if (PageHighMem(page_array[i])) { ret = -EINVAL; goto err; } } *pages = page_array; *npages = nr_pages; return page_to_virt(page_array[0]); } static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, size_t size) { return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, size); } static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, size_t size) { return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, size); } static void io_rings_free(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); ctx->rings = NULL; ctx->sq_sqes = NULL; } else { io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); ctx->n_ring_pages = 0; io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); ctx->n_sqe_pages = 0; } } static void *io_mem_alloc(size_t size) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; void *ret; ret = (void *) __get_free_pages(gfp, get_order(size)); if (ret) return ret; return ERR_PTR(-ENOMEM); } static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset) { struct io_rings *rings; size_t off, sq_array_size; off = struct_size(rings, cqes, cq_entries); if (off == SIZE_MAX) return SIZE_MAX; if (ctx->flags & IORING_SETUP_CQE32) { if (check_shl_overflow(off, 1, &off)) return SIZE_MAX; } #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); if (off == 0) return SIZE_MAX; #endif if (ctx->flags & IORING_SETUP_NO_SQARRAY) { if (sq_offset) *sq_offset = SIZE_MAX; return off; } if (sq_offset) *sq_offset = off; sq_array_size = array_size(sizeof(u32), sq_entries); if (sq_array_size == SIZE_MAX) return SIZE_MAX; if (check_add_overflow(off, sq_array_size, &off)) return SIZE_MAX; return off; } static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int eventfd_async) { struct io_ev_fd *ev_fd; __s32 __user *fds = arg; int fd; ev_fd = rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->uring_lock)); if (ev_fd) return -EBUSY; if (copy_from_user(&fd, fds, sizeof(*fds))) return -EFAULT; ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); if (!ev_fd) return -ENOMEM; ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); if (IS_ERR(ev_fd->cq_ev_fd)) { int ret = PTR_ERR(ev_fd->cq_ev_fd); kfree(ev_fd); return ret; } spin_lock(&ctx->completion_lock); ctx->evfd_last_cq_tail = ctx->cached_cq_tail; spin_unlock(&ctx->completion_lock); ev_fd->eventfd_async = eventfd_async; ctx->has_evfd = true; rcu_assign_pointer(ctx->io_ev_fd, ev_fd); atomic_set(&ev_fd->refs, 1); atomic_set(&ev_fd->ops, 0); return 0; } static int io_eventfd_unregister(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd; ev_fd = rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->uring_lock)); if (ev_fd) { ctx->has_evfd = false; rcu_assign_pointer(ctx->io_ev_fd, NULL); if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) call_rcu(&ev_fd->rcu, io_eventfd_ops); return 0; } return -ENXIO; } static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; int nr = 0; mutex_lock(&ctx->uring_lock); io_flush_cached_locked_reqs(ctx, &ctx->submit_state); while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); kmem_cache_free(req_cachep, req); nr++; } if (nr) percpu_ref_put_many(&ctx->refs, nr); mutex_unlock(&ctx->uring_lock); } static void io_rsrc_node_cache_free(struct io_cache_entry *entry) { kfree(container_of(entry, struct io_rsrc_node, cache)); } static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list))) return; mutex_lock(&ctx->uring_lock); if (ctx->buf_data) __io_sqe_buffers_unregister(ctx); if (ctx->file_data) __io_sqe_files_unregister(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_futex_cache_free(ctx); io_destroy_buffers(ctx); mutex_unlock(&ctx->uring_lock); if (ctx->sq_creds) put_cred(ctx->sq_creds); if (ctx->submitter_task) put_task_struct(ctx->submitter_task); /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) io_rsrc_node_destroy(ctx, ctx->rsrc_node); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { ctx->ring_sock->file = NULL; /* so that iput() is called */ sock_release(ctx->ring_sock); } #endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; } io_rings_free(ctx); percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); } static __cold void io_activate_pollwq_cb(struct callback_head *cb) { struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, poll_wq_task_work); mutex_lock(&ctx->uring_lock); ctx->poll_activated = true; mutex_unlock(&ctx->uring_lock); /* * Wake ups for some events between start of polling and activation * might've been lost due to loose synchronisation. */ wake_up_all(&ctx->poll_wq); percpu_ref_put(&ctx->refs); } static __cold void io_activate_pollwq(struct io_ring_ctx *ctx) { spin_lock(&ctx->completion_lock); /* already activated or in progress */ if (ctx->poll_activated || ctx->poll_wq_task_work.func) goto out; if (WARN_ON_ONCE(!ctx->task_complete)) goto out; if (!ctx->submitter_task) goto out; /* * with ->submitter_task only the submitter task completes requests, we * only need to sync with it, which is done by injecting a tw */ init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb); percpu_ref_get(&ctx->refs); if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL)) percpu_ref_put(&ctx->refs); out: spin_unlock(&ctx->completion_lock); } static __poll_t io_uring_poll(struct file *file, poll_table *wait) { struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; if (unlikely(!ctx->poll_activated)) io_activate_pollwq(ctx); poll_wait(file, &ctx->poll_wq, wait); /* * synchronizes with barrier from wq_has_sleeper call in * io_commit_cqring */ smp_rmb(); if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; /* * Don't flush cqring overflow list here, just do a simple check. * Otherwise there could possible be ABBA deadlock: * CPU0 CPU1 * ---- ---- * lock(&ctx->uring_lock); * lock(&ep->mtx); * lock(&ctx->uring_lock); * lock(&ep->mtx); * * Users may get EPOLLIN meanwhile seeing nothing in cqring, this * pushes them to do the flush. */ if (__io_cqring_events_user(ctx) || io_has_work(ctx)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { const struct cred *creds; creds = xa_erase(&ctx->personalities, id); if (creds) { put_cred(creds); return 0; } return -EINVAL; } struct io_tctx_exit { struct callback_head task_work; struct completion completion; struct io_ring_ctx *ctx; }; static __cold void io_tctx_exit_cb(struct callback_head *cb) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_exit *work; work = container_of(cb, struct io_tctx_exit, task_work); /* * When @in_cancel, we're in cancellation and it's racy to remove the * node. It'll be removed by the end of cancellation, just ignore it. * tctx can be NULL if the queueing of this task_work raced with * work cancelation off the exec path. */ if (tctx && !atomic_read(&tctx->in_cancel)) io_uring_del_tctx_node((unsigned long)work->ctx); complete(&work->completion); } static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); return req->ctx == data; } static __cold void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); unsigned long timeout = jiffies + HZ * 60 * 5; unsigned long interval = HZ / 20; struct io_tctx_exit exit; struct io_tctx_node *node; int ret; /* * If we're doing polled IO and end up having requests being * submitted async (out-of-line), then completions can come in while * we're waiting for refs to drop. We need to reap these manually, * as nobody else will be looking for them. */ do { if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { mutex_lock(&ctx->uring_lock); io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); while (io_uring_try_cancel_requests(ctx, NULL, true)) cond_resched(); if (ctx->sq_data) { struct io_sq_data *sqd = ctx->sq_data; struct task_struct *tsk; io_sq_thread_park(sqd); tsk = sqd->thread; if (tsk && tsk->io_uring && tsk->io_uring->io_wq) io_wq_cancel_cb(tsk->io_uring->io_wq, io_cancel_ctx_cb, ctx, true); io_sq_thread_unpark(sqd); } io_req_caches_free(ctx); if (WARN_ON_ONCE(time_after(jiffies, timeout))) { /* there is little hope left, don't run it too often */ interval = HZ * 60; } /* * This is really an uninterruptible wait, as it has to be * complete. But it's also run from a kworker, which doesn't * take signals, so it's fine to make it interruptible. This * avoids scenarios where we knowingly can wait much longer * on completions, for example if someone does a SIGSTOP on * a task that needs to finish task_work to make this loop * complete. That's a synthetic situation that should not * cause a stuck task backtrace, and hence a potential panic * on stuck tasks if that is enabled. */ } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval)); init_completion(&exit.completion); init_task_work(&exit.task_work, io_tctx_exit_cb); exit.ctx = ctx; /* * Some may use context even when all refs and requests have been put, * and they are free to do so while still holding uring_lock or * completion_lock, see io_req_task_submit(). Apart from other work, * this lock/unlock section also waits them to finish. */ mutex_lock(&ctx->uring_lock); while (!list_empty(&ctx->tctx_list)) { WARN_ON_ONCE(time_after(jiffies, timeout)); node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, ctx_node); /* don't spin on a single task if cancellation failed */ list_rotate_left(&ctx->tctx_list); ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); if (WARN_ON_ONCE(ret)) continue; mutex_unlock(&ctx->uring_lock); /* * See comment above for * wait_for_completion_interruptible_timeout() on why this * wait is marked as interruptible. */ wait_for_completion_interruptible(&exit.completion); mutex_lock(&ctx->uring_lock); } mutex_unlock(&ctx->uring_lock); spin_lock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock); /* pairs with RCU read section in io_req_local_work_add() */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) synchronize_rcu(); io_ring_ctx_free(ctx); } static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; struct creds *creds; mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); if (ctx->rings) io_poll_remove_all(ctx, NULL, true); mutex_unlock(&ctx->uring_lock); /* * If we failed setting up the ctx, we might not have any rings * and therefore did not submit any requests */ if (ctx->rings) io_kill_timeouts(ctx, NULL, true); flush_delayed_work(&ctx->fallback_work); INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* * Use system_unbound_wq to avoid spawning tons of event kworkers * if we're exiting a ton of rings at the same time. It just adds * noise and overhead, there's no discernable change in runtime * over using system_wq. */ queue_work(system_unbound_wq, &ctx->exit_work); } static int io_uring_release(struct inode *inode, struct file *file) { struct io_ring_ctx *ctx = file->private_data; file->private_data = NULL; io_ring_ctx_wait_and_kill(ctx); return 0; } struct io_task_cancel { struct task_struct *task; bool all; }; static bool io_cancel_task_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_task_cancel *cancel = data; return io_match_task_safe(req, cancel->task, cancel->all); } static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { struct io_defer_entry *de; LIST_HEAD(list); spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { if (io_match_task_safe(de->req, task, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } spin_unlock(&ctx->completion_lock); if (list_empty(&list)) return false; while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); list_del_init(&de->list); io_req_task_queue_fail(de->req, -ECANCELED); kfree(de); } return true; } static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) { struct io_tctx_node *node; enum io_wq_cancel cret; bool ret = false; mutex_lock(&ctx->uring_lock); list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; /* * io_wq will stay alive while we hold uring_lock, because it's * killed after ctx nodes, which requires to take the lock. */ if (!tctx || !tctx->io_wq) continue; cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); ret |= (cret != IO_WQ_CANCEL_NOTFOUND); } mutex_unlock(&ctx->uring_lock); return ret; } static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; bool ret = false; lockdep_assert_held(&ctx->uring_lock); hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, hash_node) { struct io_uring_cmd *cmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct file *file = req->file; if (!cancel_all && req->task != task) continue; if (cmd->flags & IORING_URING_CMD_CANCELABLE) { /* ->sqe isn't available if no async data */ if (!req_has_async_data(req)) cmd->sqe = NULL; file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL); ret = true; } } io_submit_flush_completions(ctx); return ret; } static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? task->io_uring : NULL; enum io_wq_cancel cret; bool ret = false; /* set it so io_req_local_work_add() would wake us up */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, 1); smp_mb(); } /* failed during ring init, it couldn't have issued any requests */ if (!ctx->rings) return false; if (!task) { ret |= io_uring_try_cancel_iowq(ctx); } else if (tctx && tctx->io_wq) { /* * Cancels requests of all rings, not only @ctx, but * it's fine as the task is in exit/exec. */ cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, &cancel, true); ret |= (cret != IO_WQ_CANCEL_NOTFOUND); } /* SQPOLL thread does its own polling */ if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || (ctx->sq_data && ctx->sq_data->thread == current)) { while (!wq_list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; cond_resched(); } } if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx) > 0; ret |= io_cancel_defer_files(ctx, task, cancel_all); mutex_lock(&ctx->uring_lock); ret |= io_poll_remove_all(ctx, task, cancel_all); ret |= io_waitid_remove_all(ctx, task, cancel_all); ret |= io_futex_remove_all(ctx, task, cancel_all); ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all); mutex_unlock(&ctx->uring_lock); ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) ret |= io_run_task_work() > 0; return ret; } static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) { if (tracked) return atomic_read(&tctx->inflight_tracked); return percpu_counter_sum(&tctx->inflight); } /* * Find any io_uring ctx that this task has registered or done IO on, and cancel * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. */ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) { struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx; struct io_tctx_node *node; unsigned long index; s64 inflight; DEFINE_WAIT(wait); WARN_ON_ONCE(sqd && sqd->thread != current); if (!current->io_uring) return; if (tctx->io_wq) io_wq_exit_start(tctx->io_wq); atomic_inc(&tctx->in_cancel); do { bool loop = false; io_uring_drop_tctx_refs(current); /* read completions before cancelations */ inflight = tctx_inflight(tctx, !cancel_all); if (!inflight) break; if (!sqd) { xa_for_each(&tctx->xa, index, node) { /* sqpoll task will cancel all its requests */ if (node->ctx->sq_data) continue; loop |= io_uring_try_cancel_requests(node->ctx, current, cancel_all); } } else { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) loop |= io_uring_try_cancel_requests(ctx, current, cancel_all); } if (loop) { cond_resched(); continue; } prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); io_run_task_work(); io_uring_drop_tctx_refs(current); xa_for_each(&tctx->xa, index, node) { if (!llist_empty(&node->ctx->work_llist)) { WARN_ON_ONCE(node->ctx->submitter_task && node->ctx->submitter_task != current); goto end_wait; } } /* * If we've seen completions, retry without waiting. This * avoids a race where a completion comes in before we did * prepare_to_wait(). */ if (inflight == tctx_inflight(tctx, !cancel_all)) schedule(); end_wait: finish_wait(&tctx->wait, &wait); } while (1); io_uring_clean_tctx(tctx); if (cancel_all) { /* * We shouldn't run task_works after cancel, so just leave * ->in_cancel set for normal exit. */ atomic_dec(&tctx->in_cancel); /* for exec all current's requests should be gone, kill tctx */ __io_uring_free(current); } } void __io_uring_cancel(bool cancel_all) { io_uring_cancel_generic(cancel_all, NULL); } static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { struct io_ring_ctx *ctx = file->private_data; loff_t offset = pgoff << PAGE_SHIFT; struct page *page; void *ptr; /* Don't allow mmap if the ring was setup without it */ if (ctx->flags & IORING_SETUP_NO_MMAP) return ERR_PTR(-EINVAL); switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: ptr = ctx->rings; break; case IORING_OFF_SQES: ptr = ctx->sq_sqes; break; case IORING_OFF_PBUF_RING: { unsigned int bgid; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; mutex_lock(&ctx->uring_lock); ptr = io_pbuf_get_address(ctx, bgid); mutex_unlock(&ctx->uring_lock); if (!ptr) return ERR_PTR(-EINVAL); break; } default: return ERR_PTR(-EINVAL); } page = virt_to_head_page(ptr); if (sz > page_size(page)) return ERR_PTR(-EINVAL); return ptr; } #ifdef CONFIG_MMU static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { size_t sz = vma->vm_end - vma->vm_start; unsigned long pfn; void *ptr; ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); if (IS_ERR(ptr)) return PTR_ERR(ptr); pfn = virt_to_phys(ptr) >> PAGE_SHIFT; return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { void *ptr; /* * Do not allow to map to user-provided address to avoid breaking the * aliasing rules. Userspace is not able to guess the offset address of * kernel kmalloc()ed memory area. */ if (addr) return -EINVAL; ptr = io_uring_validate_mmap_request(filp, pgoff, len); if (IS_ERR(ptr)) return -ENOMEM; /* * Some architectures have strong cache aliasing requirements. * For such architectures we need a coherent mapping which aliases * kernel memory *and* userspace memory. To achieve that: * - use a NULL file pointer to reference physical memory, and * - use the kernel virtual address of the shared io_uring context * (instead of the userspace-provided address, which has to be 0UL * anyway). * - use the same pgoff which the get_unmapped_area() uses to * calculate the page colouring. * For architectures without such aliasing requirements, the * architecture will return any suitable mapping because addr is 0. */ filp = NULL; flags |= MAP_SHARED; pgoff = 0; /* has been translated to ptr above */ #ifdef SHM_COLOUR addr = (uintptr_t) ptr; pgoff = addr >> PAGE_SHIFT; #else addr = 0UL; #endif return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL; } static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) { return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; } static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { void *ptr; ptr = io_uring_validate_mmap_request(file, pgoff, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); return (unsigned long) ptr; } #endif /* !CONFIG_MMU */ static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) { if (flags & IORING_ENTER_EXT_ARG) { struct io_uring_getevents_arg arg; if (argsz != sizeof(arg)) return -EINVAL; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; } return 0; } static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, struct __kernel_timespec __user **ts, const sigset_t __user **sig) { struct io_uring_getevents_arg arg; /* * If EXT_ARG isn't set, then we have no timespec and the argp pointer * is just a pointer to the sigset_t. */ if (!(flags & IORING_ENTER_EXT_ARG)) { *sig = (const sigset_t __user *) argp; *ts = NULL; return 0; } /* * EXT_ARG is set - ensure we agree on the size of it and copy in our * timespec and sigset_t pointers if good. */ if (*argsz != sizeof(arg)) return -EINVAL; if (copy_from_user(&arg, argp, sizeof(arg))) return -EFAULT; if (arg.pad) return -EINVAL; *sig = u64_to_user_ptr(arg.sigmask); *argsz = arg.sigmask_sz; *ts = u64_to_user_ptr(arg.ts); return 0; } SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, u32, min_complete, u32, flags, const void __user *, argp, size_t, argsz) { struct io_ring_ctx *ctx; struct fd f; long ret; if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | IORING_ENTER_REGISTERED_RING))) return -EINVAL; /* * Ring fd has been registered via IORING_REGISTER_RING_FDS, we * need only dereference our task private array to find it. */ if (flags & IORING_ENTER_REGISTERED_RING) { struct io_uring_task *tctx = current->io_uring; if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) return -EINVAL; fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); f.file = tctx->registered_rings[fd]; f.flags = 0; if (unlikely(!f.file)) return -EBADF; } else { f = fdget(fd); if (unlikely(!f.file)) return -EBADF; ret = -EOPNOTSUPP; if (unlikely(!io_is_uring_fops(f.file))) goto out; } ctx = f.file->private_data; ret = -EBADFD; if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) goto out; /* * For SQ polling, the thread will do all submissions and completions. * Just return the requested submit count, and wake the thread if * we were asked to. */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { io_cqring_overflow_flush(ctx); if (unlikely(ctx->sq_data->thread == NULL)) { ret = -EOWNERDEAD; goto out; } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) io_sqpoll_wait_sq(ctx); ret = to_submit; } else if (to_submit) { ret = io_uring_add_tctx_node(ctx); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit); if (ret != to_submit) { mutex_unlock(&ctx->uring_lock); goto out; } if (flags & IORING_ENTER_GETEVENTS) { if (ctx->syscall_iopoll) goto iopoll_locked; /* * Ignore errors, we'll soon call io_cqring_wait() and * it should handle ownership problems if any. */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) (void)io_run_local_work_locked(ctx); } mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { int ret2; if (ctx->syscall_iopoll) { /* * We disallow the app entering submit/complete with * polling, but we still need to lock the ring to * prevent racing with polled issue that got punted to * a workqueue. */ mutex_lock(&ctx->uring_lock); iopoll_locked: ret2 = io_validate_ext_arg(flags, argp, argsz); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries); ret2 = io_iopoll_check(ctx, min_complete); } mutex_unlock(&ctx->uring_lock); } else { const sigset_t __user *sig; struct __kernel_timespec __user *ts; ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries); ret2 = io_cqring_wait(ctx, min_complete, sig, argsz, ts); } } if (!ret) { ret = ret2; /* * EBADR indicates that one or more CQE were dropped. * Once the user has been informed we can clear the bit * as they are obviously ok with those drops. */ if (unlikely(ret2 == -EBADR)) clear_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); } } out: fdput(f); return ret; } static const struct file_operations io_uring_fops = { .release = io_uring_release, .mmap = io_uring_mmap, #ifndef CONFIG_MMU .get_unmapped_area = io_uring_nommu_get_unmapped_area, .mmap_capabilities = io_uring_nommu_mmap_capabilities, #else .get_unmapped_area = io_uring_mmu_get_unmapped_area, #endif .poll = io_uring_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = io_uring_show_fdinfo, #endif }; bool io_is_uring_fops(struct file *file) { return file->f_op == &io_uring_fops; } static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { struct io_rings *rings; size_t size, sq_array_offset; void *ptr; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; if (!(ctx->flags & IORING_SETUP_NO_MMAP)) rings = io_mem_alloc(size); else rings = io_rings_map(ctx, p->cq_off.user_addr, size); if (IS_ERR(rings)) return PTR_ERR(rings); ctx->rings = rings; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); rings->sq_ring_mask = p->sq_entries - 1; rings->cq_ring_mask = p->cq_entries - 1; rings->sq_ring_entries = p->sq_entries; rings->cq_ring_entries = p->cq_entries; if (p->flags & IORING_SETUP_SQE128) size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); else size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { io_rings_free(ctx); return -EOVERFLOW; } if (!(ctx->flags & IORING_SETUP_NO_MMAP)) ptr = io_mem_alloc(size); else ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); if (IS_ERR(ptr)) { io_rings_free(ctx); return PTR_ERR(ptr); } ctx->sq_sqes = ptr; return 0; } static int io_uring_install_fd(struct file *file) { int fd; fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (fd < 0) return fd; fd_install(fd, file); return fd; } /* * Allocate an anonymous fd, this is what constitutes the application * visible backing of an io_uring instance. The application mmaps this * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, * we have to tie this fd to a socket for file garbage collection purposes. */ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) { struct file *file; #if defined(CONFIG_UNIX) int ret; ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, &ctx->ring_sock); if (ret) return ERR_PTR(ret); #endif file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC, NULL); #if defined(CONFIG_UNIX) if (IS_ERR(file)) { sock_release(ctx->ring_sock); ctx->ring_sock = NULL; } else { ctx->ring_sock->file = file; } #endif return file; } static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, struct io_uring_params __user *params) { struct io_ring_ctx *ctx; struct io_uring_task *tctx; struct file *file; int ret; if (!entries) return -EINVAL; if (entries > IORING_MAX_ENTRIES) { if (!(p->flags & IORING_SETUP_CLAMP)) return -EINVAL; entries = IORING_MAX_ENTRIES; } if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) && !(p->flags & IORING_SETUP_NO_MMAP)) return -EINVAL; /* * Use twice as many entries for the CQ ring. It's possible for the * application to drive a higher depth than the size of the SQ ring, * since the sqes are only used at submission time. This allows for * some flexibility in overcommitting a bit. If the application has * set IORING_SETUP_CQSIZE, it will have passed in the desired number * of CQ ring entries manually. */ p->sq_entries = roundup_pow_of_two(entries); if (p->flags & IORING_SETUP_CQSIZE) { /* * If IORING_SETUP_CQSIZE is set, we do the same roundup * to a power-of-two, if it isn't already. We do NOT impose * any cq vs sq ring sizing. */ if (!p->cq_entries) return -EINVAL; if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { if (!(p->flags & IORING_SETUP_CLAMP)) return -EINVAL; p->cq_entries = IORING_MAX_CQ_ENTRIES; } p->cq_entries = roundup_pow_of_two(p->cq_entries); if (p->cq_entries < p->sq_entries) return -EINVAL; } else { p->cq_entries = 2 * p->sq_entries; } ctx = io_ring_ctx_alloc(p); if (!ctx) return -ENOMEM; if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL) && !(ctx->flags & IORING_SETUP_SQPOLL)) ctx->task_complete = true; if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) ctx->lockless_cq = true; /* * lazy poll_wq activation relies on ->task_complete for synchronisation * purposes, see io_activate_pollwq() */ if (!ctx->task_complete) ctx->poll_activated = true; /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user * space applications don't need to do io completion events * polling again, they can rely on io_sq_thread to do polling * work, which can reduce cpu usage and uring_lock contention. */ if (ctx->flags & IORING_SETUP_IOPOLL && !(ctx->flags & IORING_SETUP_SQPOLL)) ctx->syscall_iopoll = 1; ctx->compat = in_compat_syscall(); if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); /* * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if * COOP_TASKRUN is set, then IPIs are never needed by the app. */ ret = -EINVAL; if (ctx->flags & IORING_SETUP_SQPOLL) { /* IPI related flags don't make sense with SQPOLL */ if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_DEFER_TASKRUN)) goto err; ctx->notify_method = TWA_SIGNAL_NO_IPI; } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { ctx->notify_method = TWA_SIGNAL_NO_IPI; } else { if (ctx->flags & IORING_SETUP_TASKRUN_FLAG && !(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) goto err; ctx->notify_method = TWA_SIGNAL; } /* * For DEFER_TASKRUN we require the completion task to be the same as the * submission task. This implies that there is only one submitter, so enforce * that. */ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN && !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) { goto err; } /* * This is just grabbed for accounting purposes. When a process exits, * the mm is exited and dropped before the files, hence we need to hang * on to this mm purely for the purposes of being able to unaccount * memory (locked/pinned vm). It's not used for anything else. */ mmgrab(current->mm); ctx->mm_account = current->mm; ret = io_allocate_scq_urings(ctx, p); if (ret) goto err; ret = io_sq_offload_create(ctx, p); if (ret) goto err; ret = io_rsrc_init(ctx); if (ret) goto err; p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); p->sq_off.flags = offsetof(struct io_rings, sq_flags); p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; p->sq_off.resv1 = 0; if (!(ctx->flags & IORING_SETUP_NO_MMAP)) p->sq_off.user_addr = 0; p->cq_off.head = offsetof(struct io_rings, cq.head); p->cq_off.tail = offsetof(struct io_rings, cq.tail); p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); p->cq_off.flags = offsetof(struct io_rings, cq_flags); p->cq_off.resv1 = 0; if (!(ctx->flags & IORING_SETUP_NO_MMAP)) p->cq_off.user_addr = 0; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; goto err; } if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !(ctx->flags & IORING_SETUP_R_DISABLED)) WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); file = io_uring_get_file(ctx); if (IS_ERR(file)) { ret = PTR_ERR(file); goto err; } ret = __io_uring_add_tctx_node(ctx); if (ret) goto err_fput; tctx = current->io_uring; /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup */ if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX); else ret = io_uring_install_fd(file); if (ret < 0) goto err_fput; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: io_ring_ctx_wait_and_kill(ctx); return ret; err_fput: fput(file); return ret; } /* * Sets up an aio uring context, and returns the fd. Applications asks for a * ring size, we return the actual sq/cq ring sizes (among other things) in the * params structure passed in. */ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) { struct io_uring_params p; int i; if (copy_from_user(&p, params, sizeof(p))) return -EFAULT; for (i = 0; i < ARRAY_SIZE(p.resv); i++) { if (p.resv[i]) return -EINVAL; } if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | IORING_SETUP_NO_SQARRAY)) return -EINVAL; return io_uring_create(entries, &p, params); } static inline bool io_uring_allowed(void) { int disabled = READ_ONCE(sysctl_io_uring_disabled); kgid_t io_uring_group; if (disabled == 2) return false; if (disabled == 0 || capable(CAP_SYS_ADMIN)) return true; io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group); if (!gid_valid(io_uring_group)) return false; return in_group_p(io_uring_group); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, struct io_uring_params __user *, params) { if (!io_uring_allowed()) return -EPERM; return io_uring_setup(entries, params); } static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct io_uring_probe *p; size_t size; int i, ret; size = struct_size(p, ops, nr_args); if (size == SIZE_MAX) return -EOVERFLOW; p = kzalloc(size, GFP_KERNEL); if (!p) return -ENOMEM; ret = -EFAULT; if (copy_from_user(p, arg, size)) goto out; ret = -EINVAL; if (memchr_inv(p, 0, size)) goto out; p->last_op = IORING_OP_LAST - 1; if (nr_args > IORING_OP_LAST) nr_args = IORING_OP_LAST; for (i = 0; i < nr_args; i++) { p->ops[i].op = i; if (!io_issue_defs[i].not_supported) p->ops[i].flags = IO_URING_OP_SUPPORTED; } p->ops_len = i; ret = 0; if (copy_to_user(arg, p, size)) ret = -EFAULT; out: kfree(p); return ret; } static int io_register_personality(struct io_ring_ctx *ctx) { const struct cred *creds; u32 id; int ret; creds = get_current_cred(); ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); if (ret < 0) { put_cred(creds); return ret; } return id; } static __cold int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args) { struct io_uring_restriction *res; size_t size; int i, ret; /* Restrictions allowed only if rings started disabled */ if (!(ctx->flags & IORING_SETUP_R_DISABLED)) return -EBADFD; /* We allow only a single restrictions registration */ if (ctx->restrictions.registered) return -EBUSY; if (!arg || nr_args > IORING_MAX_RESTRICTIONS) return -EINVAL; size = array_size(nr_args, sizeof(*res)); if (size == SIZE_MAX) return -EOVERFLOW; res = memdup_user(arg, size); if (IS_ERR(res)) return PTR_ERR(res); ret = 0; for (i = 0; i < nr_args; i++) { switch (res[i].opcode) { case IORING_RESTRICTION_REGISTER_OP: if (res[i].register_op >= IORING_REGISTER_LAST) { ret = -EINVAL; goto out; } __set_bit(res[i].register_op, ctx->restrictions.register_op); break; case IORING_RESTRICTION_SQE_OP: if (res[i].sqe_op >= IORING_OP_LAST) { ret = -EINVAL; goto out; } __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); break; case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; break; case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: ctx->restrictions.sqe_flags_required = res[i].sqe_flags; break; default: ret = -EINVAL; goto out; } } out: /* Reset all restrictions if an error happened */ if (ret != 0) memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); else ctx->restrictions.registered = true; kfree(res); return ret; } static int io_register_enable_rings(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_R_DISABLED)) return -EBADFD; if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); /* * Lazy activation attempts would fail if it was polled before * submitter_task is set. */ if (wq_has_sleeper(&ctx->poll_wq)) io_activate_pollwq(ctx); } if (ctx->restrictions.registered) ctx->restricted = 1; ctx->flags &= ~IORING_SETUP_R_DISABLED; if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); return 0; } static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx, cpumask_var_t new_mask) { int ret; if (!(ctx->flags & IORING_SETUP_SQPOLL)) { ret = io_wq_cpu_affinity(current->io_uring, new_mask); } else { mutex_unlock(&ctx->uring_lock); ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); mutex_lock(&ctx->uring_lock); } return ret; } static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, unsigned len) { cpumask_var_t new_mask; int ret; if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) return -ENOMEM; cpumask_clear(new_mask); if (len > cpumask_size()) len = cpumask_size(); if (in_compat_syscall()) { ret = compat_get_bitmap(cpumask_bits(new_mask), (const compat_ulong_t __user *)arg, len * 8 /* CHAR_BIT */); } else { ret = copy_from_user(new_mask, arg, len); } if (ret) { free_cpumask_var(new_mask); return -EFAULT; } ret = __io_register_iowq_aff(ctx, new_mask); free_cpumask_var(new_mask); return ret; } static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) { return __io_register_iowq_aff(ctx, NULL); } static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, void __user *arg) __must_hold(&ctx->uring_lock) { struct io_tctx_node *node; struct io_uring_task *tctx = NULL; struct io_sq_data *sqd = NULL; __u32 new_count[2]; int i, ret; if (copy_from_user(new_count, arg, sizeof(new_count))) return -EFAULT; for (i = 0; i < ARRAY_SIZE(new_count); i++) if (new_count[i] > INT_MAX) return -EINVAL; if (ctx->flags & IORING_SETUP_SQPOLL) { sqd = ctx->sq_data; if (sqd) { /* * Observe the correct sqd->lock -> ctx->uring_lock * ordering. Fine to drop uring_lock here, we hold * a ref to the ctx. */ refcount_inc(&sqd->refs); mutex_unlock(&ctx->uring_lock); mutex_lock(&sqd->lock); mutex_lock(&ctx->uring_lock); if (sqd->thread) tctx = sqd->thread->io_uring; } } else { tctx = current->io_uring; } BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); for (i = 0; i < ARRAY_SIZE(new_count); i++) if (new_count[i]) ctx->iowq_limits[i] = new_count[i]; ctx->iowq_limits_set = true; if (tctx && tctx->io_wq) { ret = io_wq_max_workers(tctx->io_wq, new_count); if (ret) goto err; } else { memset(new_count, 0, sizeof(new_count)); } if (sqd) { mutex_unlock(&sqd->lock); io_put_sq_data(sqd); } if (copy_to_user(arg, new_count, sizeof(new_count))) return -EFAULT; /* that's it for SQPOLL, only the SQPOLL task creates requests */ if (sqd) return 0; /* now propagate the restriction to all registered users */ list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; if (WARN_ON_ONCE(!tctx->io_wq)) continue; for (i = 0; i < ARRAY_SIZE(new_count); i++) new_count[i] = ctx->iowq_limits[i]; /* ignore errors, it always returns zero anyway */ (void)io_wq_max_workers(tctx->io_wq, new_count); } return 0; err: if (sqd) { mutex_unlock(&sqd->lock); io_put_sq_data(sqd); } return ret; } static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) __acquires(ctx->uring_lock) { int ret; /* * We don't quiesce the refs for register anymore and so it can't be * dying as we're holding a file ref here. */ if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) return -ENXIO; if (ctx->submitter_task && ctx->submitter_task != current) return -EEXIST; if (ctx->restricted) { opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); if (!test_bit(opcode, ctx->restrictions.register_op)) return -EACCES; } switch (opcode) { case IORING_REGISTER_BUFFERS: ret = -EFAULT; if (!arg) break; ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_BUFFERS: ret = -EINVAL; if (arg || nr_args) break; ret = io_sqe_buffers_unregister(ctx); break; case IORING_REGISTER_FILES: ret = -EFAULT; if (!arg) break; ret = io_sqe_files_register(ctx, arg, nr_args, NULL); break; case IORING_UNREGISTER_FILES: ret = -EINVAL; if (arg || nr_args) break; ret = io_sqe_files_unregister(ctx); break; case IORING_REGISTER_FILES_UPDATE: ret = io_register_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: ret = -EINVAL; if (nr_args != 1) break; ret = io_eventfd_register(ctx, arg, 0); break; case IORING_REGISTER_EVENTFD_ASYNC: ret = -EINVAL; if (nr_args != 1) break; ret = io_eventfd_register(ctx, arg, 1); break; case IORING_UNREGISTER_EVENTFD: ret = -EINVAL; if (arg || nr_args) break; ret = io_eventfd_unregister(ctx); break; case IORING_REGISTER_PROBE: ret = -EINVAL; if (!arg || nr_args > 256) break; ret = io_probe(ctx, arg, nr_args); break; case IORING_REGISTER_PERSONALITY: ret = -EINVAL; if (arg || nr_args) break; ret = io_register_personality(ctx); break; case IORING_UNREGISTER_PERSONALITY: ret = -EINVAL; if (arg) break; ret = io_unregister_personality(ctx, nr_args); break; case IORING_REGISTER_ENABLE_RINGS: ret = -EINVAL; if (arg || nr_args) break; ret = io_register_enable_rings(ctx); break; case IORING_REGISTER_RESTRICTIONS: ret = io_register_restrictions(ctx, arg, nr_args); break; case IORING_REGISTER_FILES2: ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); break; case IORING_REGISTER_FILES_UPDATE2: ret = io_register_rsrc_update(ctx, arg, nr_args, IORING_RSRC_FILE); break; case IORING_REGISTER_BUFFERS2: ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); break; case IORING_REGISTER_BUFFERS_UPDATE: ret = io_register_rsrc_update(ctx, arg, nr_args, IORING_RSRC_BUFFER); break; case IORING_REGISTER_IOWQ_AFF: ret = -EINVAL; if (!arg || !nr_args) break; ret = io_register_iowq_aff(ctx, arg, nr_args); break; case IORING_UNREGISTER_IOWQ_AFF: ret = -EINVAL; if (arg || nr_args) break; ret = io_unregister_iowq_aff(ctx); break; case IORING_REGISTER_IOWQ_MAX_WORKERS: ret = -EINVAL; if (!arg || nr_args != 2) break; ret = io_register_iowq_max_workers(ctx, arg); break; case IORING_REGISTER_RING_FDS: ret = io_ringfd_register(ctx, arg, nr_args); break; case IORING_UNREGISTER_RING_FDS: ret = io_ringfd_unregister(ctx, arg, nr_args); break; case IORING_REGISTER_PBUF_RING: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_register_pbuf_ring(ctx, arg); break; case IORING_UNREGISTER_PBUF_RING: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_unregister_pbuf_ring(ctx, arg); break; case IORING_REGISTER_SYNC_CANCEL: ret = -EINVAL; if (!arg || nr_args != 1) break; ret = io_sync_cancel(ctx, arg); break; case IORING_REGISTER_FILE_ALLOC_RANGE: ret = -EINVAL; if (!arg || nr_args) break; ret = io_register_file_alloc_range(ctx, arg); break; default: ret = -EINVAL; break; } return ret; } SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, void __user *, arg, unsigned int, nr_args) { struct io_ring_ctx *ctx; long ret = -EBADF; struct fd f; bool use_registered_ring; use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; if (opcode >= IORING_REGISTER_LAST) return -EINVAL; if (use_registered_ring) { /* * Ring fd has been registered via IORING_REGISTER_RING_FDS, we * need only dereference our task private array to find it. */ struct io_uring_task *tctx = current->io_uring; if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) return -EINVAL; fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); f.file = tctx->registered_rings[fd]; f.flags = 0; if (unlikely(!f.file)) return -EBADF; } else { f = fdget(fd); if (unlikely(!f.file)) return -EBADF; ret = -EOPNOTSUPP; if (!io_is_uring_fops(f.file)) goto out_fput; } ctx = f.file->private_data; mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); mutex_unlock(&ctx->uring_lock); trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); out_fput: fdput(f); return ret; } static int __init io_uring_init(void) { #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \ } while (0) #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename) #define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \ __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename) BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64); BUILD_BUG_SQE_ELEM(0, __u8, opcode); BUILD_BUG_SQE_ELEM(1, __u8, flags); BUILD_BUG_SQE_ELEM(2, __u16, ioprio); BUILD_BUG_SQE_ELEM(4, __s32, fd); BUILD_BUG_SQE_ELEM(8, __u64, off); BUILD_BUG_SQE_ELEM(8, __u64, addr2); BUILD_BUG_SQE_ELEM(8, __u32, cmd_op); BUILD_BUG_SQE_ELEM(12, __u32, __pad1); BUILD_BUG_SQE_ELEM(16, __u64, addr); BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); BUILD_BUG_SQE_ELEM(24, __u32, len); BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events); BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); BUILD_BUG_SQE_ELEM(28, __u32, accept_flags); BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags); BUILD_BUG_SQE_ELEM(28, __u32, open_flags); BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); BUILD_BUG_SQE_ELEM(28, __u32, rename_flags); BUILD_BUG_SQE_ELEM(28, __u32, unlink_flags); BUILD_BUG_SQE_ELEM(28, __u32, hardlink_flags); BUILD_BUG_SQE_ELEM(28, __u32, xattr_flags); BUILD_BUG_SQE_ELEM(28, __u32, msg_ring_flags); BUILD_BUG_SQE_ELEM(32, __u64, user_data); BUILD_BUG_SQE_ELEM(40, __u16, buf_index); BUILD_BUG_SQE_ELEM(40, __u16, buf_group); BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, __u32, file_index); BUILD_BUG_SQE_ELEM(44, __u16, addr_len); BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); BUILD_BUG_SQE_ELEM(56, __u64, __pad2); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > sizeof(struct io_uring_rsrc_update2)); /* ->buf_index is u16 */ BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != offsetof(struct io_uring_buf_ring, tail)); /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); /* top 8bits are for internal use */ BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0); io_uring_optable_init(); /* * Allow user copy in the per-command field, which starts after the * file in io_kiocb and until the opcode field. The openat2 handling * requires copying in user memory into the io_kiocb object in that * range, and HARDENED_USERCOPY will complain if we haven't * correctly annotated this range. */ req_cachep = kmem_cache_create_usercopy("io_kiocb", sizeof(struct io_kiocb), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, offsetof(struct io_kiocb, cmd.data), sizeof_field(struct io_kiocb, cmd.data), NULL); io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); #endif return 0; }; __initcall(io_uring_init);
100 38 37 100 40 40 208 207 208 208 256 256 256 256 255 239 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 // SPDX-License-Identifier: GPL-2.0-only /* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. These functions are needed by GSO/GRO implementation. */ #include <linux/export.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/addrconf.h> #include <net/secure_seq.h> #include <linux/netfilter.h> static u32 __ipv6_select_ident(struct net *net, const struct in6_addr *dst, const struct in6_addr *src) { return get_random_u32_above(0); } /* This function exists only for tap drivers that must support broken * clients requesting UFO without specifying an IPv6 fragment ID. * * This is similar to ipv6_select_ident() but we use an independent hash * seed to limit information leakage. * * The network header must be set before calling this. */ __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) { struct in6_addr buf[2]; struct in6_addr *addrs; u32 id; addrs = skb_header_pointer(skb, skb_network_offset(skb) + offsetof(struct ipv6hdr, saddr), sizeof(buf), buf); if (!addrs) return 0; id = __ipv6_select_ident(net, &addrs[1], &addrs[0]); return htonl(id); } EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) { u32 id; id = __ipv6_select_ident(net, daddr, saddr); return htonl(id); } EXPORT_SYMBOL(ipv6_select_ident); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { unsigned int offset = sizeof(struct ipv6hdr); unsigned int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; while (offset <= packet_len) { struct ipv6_opt_hdr *exthdr; switch (**nexthdr) { case NEXTHDR_HOP: break; case NEXTHDR_ROUTING: found_rhdr = 1; break; case NEXTHDR_DEST: #if IS_ENABLED(CONFIG_IPV6_MIP6) if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) break; #endif if (found_rhdr) return offset; break; default: return offset; } if (offset + sizeof(struct ipv6_opt_hdr) > packet_len) return -EINVAL; exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); offset += ipv6_optlen(exthdr); if (offset > IPV6_MAXPLEN) return -EINVAL; *nexthdr = &exthdr->nexthdr; } return -EINVAL; } EXPORT_SYMBOL(ip6_find_1stfragopt); #if IS_ENABLED(CONFIG_IPV6) int ip6_dst_hoplimit(struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); if (hoplimit == 0) { struct net_device *dev = dst->dev; struct inet6_dev *idev; rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) hoplimit = idev->cnf.hop_limit; else hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; rcu_read_unlock(); } return hoplimit; } EXPORT_SYMBOL(ip6_dst_hoplimit); #endif int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int len; len = skb->len - sizeof(struct ipv6hdr); if (len > IPV6_MAXPLEN) len = 0; ipv6_hdr(skb)->payload_len = htons(len); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_out(sk, skb); if (unlikely(!skb)) return 0; skb->protocol = htons(ETH_P_IPV6); return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); } EXPORT_SYMBOL_GPL(__ip6_local_out); int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = __ip6_local_out(net, sk, skb); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } EXPORT_SYMBOL_GPL(ip6_local_out);
1476 1476 4 4 4 1 1 7 1 1 6 6 4 5 1 1 4 4 4 4 4 1 4 4 4 4 1 1 1 8 8 8 8 8 8 40 40 5 5 40 38 38 117 117 2 93 93 32 32 32 8 1 4 4 4 4 2 1 1 2 1 1 27 27 27 27 27 27 27 27 6 1 1 3 1 1 1 1 10 10 10 10 10 10 10 10 10 10 1476 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 // SPDX-License-Identifier: GPL-2.0-or-later /* * drivers/net/team/team.c - Network team device driver * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com> */ #include <linux/ethtool.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/errno.h> #include <linux/ctype.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netpoll.h> #include <linux/if_vlan.h> #include <linux/if_arp.h> #include <linux/socket.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/rtnetlink.h> #include <net/genetlink.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <generated/utsrelease.h> #include <linux/if_team.h> #define DRV_NAME "team" /********** * Helpers **********/ static struct team_port *team_port_get_rtnl(const struct net_device *dev) { struct team_port *port = rtnl_dereference(dev->rx_handler_data); return netif_is_team_port(dev) ? port : NULL; } /* * Since the ability to change device address for open port device is tested in * team_port_add, this function can be called without control of return value */ static int __set_port_dev_addr(struct net_device *port_dev, const unsigned char *dev_addr) { struct sockaddr_storage addr; memcpy(addr.__data, dev_addr, port_dev->addr_len); addr.ss_family = port_dev->type; return dev_set_mac_address(port_dev, (struct sockaddr *)&addr, NULL); } static int team_port_set_orig_dev_addr(struct team_port *port) { return __set_port_dev_addr(port->dev, port->orig.dev_addr); } static int team_port_set_team_dev_addr(struct team *team, struct team_port *port) { return __set_port_dev_addr(port->dev, team->dev->dev_addr); } int team_modeop_port_enter(struct team *team, struct team_port *port) { return team_port_set_team_dev_addr(team, port); } EXPORT_SYMBOL(team_modeop_port_enter); void team_modeop_port_change_dev_addr(struct team *team, struct team_port *port) { team_port_set_team_dev_addr(team, port); } EXPORT_SYMBOL(team_modeop_port_change_dev_addr); static void team_lower_state_changed(struct team_port *port) { struct netdev_lag_lower_state_info info; info.link_up = port->linkup; info.tx_enabled = team_port_enabled(port); netdev_lower_state_changed(port->dev, &info); } static void team_refresh_port_linkup(struct team_port *port) { bool new_linkup = port->user.linkup_enabled ? port->user.linkup : port->state.linkup; if (port->linkup != new_linkup) { port->linkup = new_linkup; team_lower_state_changed(port); } } /******************* * Options handling *******************/ struct team_option_inst { /* One for each option instance */ struct list_head list; struct list_head tmp_list; struct team_option *option; struct team_option_inst_info info; bool changed; bool removed; }; static struct team_option *__team_find_option(struct team *team, const char *opt_name) { struct team_option *option; list_for_each_entry(option, &team->option_list, list) { if (strcmp(option->name, opt_name) == 0) return option; } return NULL; } static void __team_option_inst_del(struct team_option_inst *opt_inst) { list_del(&opt_inst->list); kfree(opt_inst); } static void __team_option_inst_del_option(struct team *team, struct team_option *option) { struct team_option_inst *opt_inst, *tmp; list_for_each_entry_safe(opt_inst, tmp, &team->option_inst_list, list) { if (opt_inst->option == option) __team_option_inst_del(opt_inst); } } static int __team_option_inst_add(struct team *team, struct team_option *option, struct team_port *port) { struct team_option_inst *opt_inst; unsigned int array_size; unsigned int i; array_size = option->array_size; if (!array_size) array_size = 1; /* No array but still need one instance */ for (i = 0; i < array_size; i++) { opt_inst = kmalloc(sizeof(*opt_inst), GFP_KERNEL); if (!opt_inst) return -ENOMEM; opt_inst->option = option; opt_inst->info.port = port; opt_inst->info.array_index = i; opt_inst->changed = true; opt_inst->removed = false; list_add_tail(&opt_inst->list, &team->option_inst_list); if (option->init) option->init(team, &opt_inst->info); } return 0; } static int __team_option_inst_add_option(struct team *team, struct team_option *option) { int err; if (!option->per_port) { err = __team_option_inst_add(team, option, NULL); if (err) goto inst_del_option; } return 0; inst_del_option: __team_option_inst_del_option(team, option); return err; } static void __team_option_inst_mark_removed_option(struct team *team, struct team_option *option) { struct team_option_inst *opt_inst; list_for_each_entry(opt_inst, &team->option_inst_list, list) { if (opt_inst->option == option) { opt_inst->changed = true; opt_inst->removed = true; } } } static void __team_option_inst_del_port(struct team *team, struct team_port *port) { struct team_option_inst *opt_inst, *tmp; list_for_each_entry_safe(opt_inst, tmp, &team->option_inst_list, list) { if (opt_inst->option->per_port && opt_inst->info.port == port) __team_option_inst_del(opt_inst); } } static int __team_option_inst_add_port(struct team *team, struct team_port *port) { struct team_option *option; int err; list_for_each_entry(option, &team->option_list, list) { if (!option->per_port) continue; err = __team_option_inst_add(team, option, port); if (err) goto inst_del_port; } return 0; inst_del_port: __team_option_inst_del_port(team, port); return err; } static void __team_option_inst_mark_removed_port(struct team *team, struct team_port *port) { struct team_option_inst *opt_inst; list_for_each_entry(opt_inst, &team->option_inst_list, list) { if (opt_inst->info.port == port) { opt_inst->changed = true; opt_inst->removed = true; } } } static int __team_options_register(struct team *team, const struct team_option *option, size_t option_count) { int i; struct team_option **dst_opts; int err; dst_opts = kcalloc(option_count, sizeof(struct team_option *), GFP_KERNEL); if (!dst_opts) return -ENOMEM; for (i = 0; i < option_count; i++, option++) { if (__team_find_option(team, option->name)) { err = -EEXIST; goto alloc_rollback; } dst_opts[i] = kmemdup(option, sizeof(*option), GFP_KERNEL); if (!dst_opts[i]) { err = -ENOMEM; goto alloc_rollback; } } for (i = 0; i < option_count; i++) { err = __team_option_inst_add_option(team, dst_opts[i]); if (err) goto inst_rollback; list_add_tail(&dst_opts[i]->list, &team->option_list); } kfree(dst_opts); return 0; inst_rollback: for (i--; i >= 0; i--) __team_option_inst_del_option(team, dst_opts[i]); i = option_count; alloc_rollback: for (i--; i >= 0; i--) kfree(dst_opts[i]); kfree(dst_opts); return err; } static void __team_options_mark_removed(struct team *team, const struct team_option *option, size_t option_count) { int i; for (i = 0; i < option_count; i++, option++) { struct team_option *del_opt; del_opt = __team_find_option(team, option->name); if (del_opt) __team_option_inst_mark_removed_option(team, del_opt); } } static void __team_options_unregister(struct team *team, const struct team_option *option, size_t option_count) { int i; for (i = 0; i < option_count; i++, option++) { struct team_option *del_opt; del_opt = __team_find_option(team, option->name); if (del_opt) { __team_option_inst_del_option(team, del_opt); list_del(&del_opt->list); kfree(del_opt); } } } static void __team_options_change_check(struct team *team); int team_options_register(struct team *team, const struct team_option *option, size_t option_count) { int err; err = __team_options_register(team, option, option_count); if (err) return err; __team_options_change_check(team); return 0; } EXPORT_SYMBOL(team_options_register); void team_options_unregister(struct team *team, const struct team_option *option, size_t option_count) { __team_options_mark_removed(team, option, option_count); __team_options_change_check(team); __team_options_unregister(team, option, option_count); } EXPORT_SYMBOL(team_options_unregister); static int team_option_get(struct team *team, struct team_option_inst *opt_inst, struct team_gsetter_ctx *ctx) { if (!opt_inst->option->getter) return -EOPNOTSUPP; opt_inst->option->getter(team, ctx); return 0; } static int team_option_set(struct team *team, struct team_option_inst *opt_inst, struct team_gsetter_ctx *ctx) { if (!opt_inst->option->setter) return -EOPNOTSUPP; return opt_inst->option->setter(team, ctx); } void team_option_inst_set_change(struct team_option_inst_info *opt_inst_info) { struct team_option_inst *opt_inst; opt_inst = container_of(opt_inst_info, struct team_option_inst, info); opt_inst->changed = true; } EXPORT_SYMBOL(team_option_inst_set_change); void team_options_change_check(struct team *team) { __team_options_change_check(team); } EXPORT_SYMBOL(team_options_change_check); /**************** * Mode handling ****************/ static LIST_HEAD(mode_list); static DEFINE_SPINLOCK(mode_list_lock); struct team_mode_item { struct list_head list; const struct team_mode *mode; }; static struct team_mode_item *__find_mode(const char *kind) { struct team_mode_item *mitem; list_for_each_entry(mitem, &mode_list, list) { if (strcmp(mitem->mode->kind, kind) == 0) return mitem; } return NULL; } static bool is_good_mode_name(const char *name) { while (*name != '\0') { if (!isalpha(*name) && !isdigit(*name) && *name != '_') return false; name++; } return true; } int team_mode_register(const struct team_mode *mode) { int err = 0; struct team_mode_item *mitem; if (!is_good_mode_name(mode->kind) || mode->priv_size > TEAM_MODE_PRIV_SIZE) return -EINVAL; mitem = kmalloc(sizeof(*mitem), GFP_KERNEL); if (!mitem) return -ENOMEM; spin_lock(&mode_list_lock); if (__find_mode(mode->kind)) { err = -EEXIST; kfree(mitem); goto unlock; } mitem->mode = mode; list_add_tail(&mitem->list, &mode_list); unlock: spin_unlock(&mode_list_lock); return err; } EXPORT_SYMBOL(team_mode_register); void team_mode_unregister(const struct team_mode *mode) { struct team_mode_item *mitem; spin_lock(&mode_list_lock); mitem = __find_mode(mode->kind); if (mitem) { list_del_init(&mitem->list); kfree(mitem); } spin_unlock(&mode_list_lock); } EXPORT_SYMBOL(team_mode_unregister); static const struct team_mode *team_mode_get(const char *kind) { struct team_mode_item *mitem; const struct team_mode *mode = NULL; if (!try_module_get(THIS_MODULE)) return NULL; spin_lock(&mode_list_lock); mitem = __find_mode(kind); if (!mitem) { spin_unlock(&mode_list_lock); request_module("team-mode-%s", kind); spin_lock(&mode_list_lock); mitem = __find_mode(kind); } if (mitem) { mode = mitem->mode; if (!try_module_get(mode->owner)) mode = NULL; } spin_unlock(&mode_list_lock); module_put(THIS_MODULE); return mode; } static void team_mode_put(const struct team_mode *mode) { module_put(mode->owner); } static bool team_dummy_transmit(struct team *team, struct sk_buff *skb) { dev_kfree_skb_any(skb); return false; } static rx_handler_result_t team_dummy_receive(struct team *team, struct team_port *port, struct sk_buff *skb) { return RX_HANDLER_ANOTHER; } static const struct team_mode __team_no_mode = { .kind = "*NOMODE*", }; static bool team_is_mode_set(struct team *team) { return team->mode != &__team_no_mode; } static void team_set_no_mode(struct team *team) { team->user_carrier_enabled = false; team->mode = &__team_no_mode; } static void team_adjust_ops(struct team *team) { /* * To avoid checks in rx/tx skb paths, ensure here that non-null and * correct ops are always set. */ if (!team->en_port_count || !team_is_mode_set(team) || !team->mode->ops->transmit) team->ops.transmit = team_dummy_transmit; else team->ops.transmit = team->mode->ops->transmit; if (!team->en_port_count || !team_is_mode_set(team) || !team->mode->ops->receive) team->ops.receive = team_dummy_receive; else team->ops.receive = team->mode->ops->receive; } /* * We can benefit from the fact that it's ensured no port is present * at the time of mode change. Therefore no packets are in fly so there's no * need to set mode operations in any special way. */ static int __team_change_mode(struct team *team, const struct team_mode *new_mode) { /* Check if mode was previously set and do cleanup if so */ if (team_is_mode_set(team)) { void (*exit_op)(struct team *team) = team->ops.exit; /* Clear ops area so no callback is called any longer */ memset(&team->ops, 0, sizeof(struct team_mode_ops)); team_adjust_ops(team); if (exit_op) exit_op(team); team_mode_put(team->mode); team_set_no_mode(team); /* zero private data area */ memset(&team->mode_priv, 0, sizeof(struct team) - offsetof(struct team, mode_priv)); } if (!new_mode) return 0; if (new_mode->ops->init) { int err; err = new_mode->ops->init(team); if (err) return err; } team->mode = new_mode; memcpy(&team->ops, new_mode->ops, sizeof(struct team_mode_ops)); team_adjust_ops(team); return 0; } static int team_change_mode(struct team *team, const char *kind) { const struct team_mode *new_mode; struct net_device *dev = team->dev; int err; if (!list_empty(&team->port_list)) { netdev_err(dev, "No ports can be present during mode change\n"); return -EBUSY; } if (team_is_mode_set(team) && strcmp(team->mode->kind, kind) == 0) { netdev_err(dev, "Unable to change to the same mode the team is in\n"); return -EINVAL; } new_mode = team_mode_get(kind); if (!new_mode) { netdev_err(dev, "Mode \"%s\" not found\n", kind); return -EINVAL; } err = __team_change_mode(team, new_mode); if (err) { netdev_err(dev, "Failed to change to mode \"%s\"\n", kind); team_mode_put(new_mode); return err; } netdev_info(dev, "Mode changed to \"%s\"\n", kind); return 0; } /********************* * Peers notification *********************/ static void team_notify_peers_work(struct work_struct *work) { struct team *team; int val; team = container_of(work, struct team, notify_peers.dw.work); if (!rtnl_trylock()) { schedule_delayed_work(&team->notify_peers.dw, 0); return; } val = atomic_dec_if_positive(&team->notify_peers.count_pending); if (val < 0) { rtnl_unlock(); return; } call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, team->dev); rtnl_unlock(); if (val) schedule_delayed_work(&team->notify_peers.dw, msecs_to_jiffies(team->notify_peers.interval)); } static void team_notify_peers(struct team *team) { if (!team->notify_peers.count || !netif_running(team->dev)) return; atomic_add(team->notify_peers.count, &team->notify_peers.count_pending); schedule_delayed_work(&team->notify_peers.dw, 0); } static void team_notify_peers_init(struct team *team) { INIT_DELAYED_WORK(&team->notify_peers.dw, team_notify_peers_work); } static void team_notify_peers_fini(struct team *team) { cancel_delayed_work_sync(&team->notify_peers.dw); } /******************************* * Send multicast group rejoins *******************************/ static void team_mcast_rejoin_work(struct work_struct *work) { struct team *team; int val; team = container_of(work, struct team, mcast_rejoin.dw.work); if (!rtnl_trylock()) { schedule_delayed_work(&team->mcast_rejoin.dw, 0); return; } val = atomic_dec_if_positive(&team->mcast_rejoin.count_pending); if (val < 0) { rtnl_unlock(); return; } call_netdevice_notifiers(NETDEV_RESEND_IGMP, team->dev); rtnl_unlock(); if (val) schedule_delayed_work(&team->mcast_rejoin.dw, msecs_to_jiffies(team->mcast_rejoin.interval)); } static void team_mcast_rejoin(struct team *team) { if (!team->mcast_rejoin.count || !netif_running(team->dev)) return; atomic_add(team->mcast_rejoin.count, &team->mcast_rejoin.count_pending); schedule_delayed_work(&team->mcast_rejoin.dw, 0); } static void team_mcast_rejoin_init(struct team *team) { INIT_DELAYED_WORK(&team->mcast_rejoin.dw, team_mcast_rejoin_work); } static void team_mcast_rejoin_fini(struct team *team) { cancel_delayed_work_sync(&team->mcast_rejoin.dw); } /************************ * Rx path frame handler ************************/ /* note: already called with rcu_read_lock */ static rx_handler_result_t team_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct team_port *port; struct team *team; rx_handler_result_t res; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return RX_HANDLER_CONSUMED; *pskb = skb; port = team_port_get_rcu(skb->dev); team = port->team; if (!team_port_enabled(port)) { if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) /* link-local packets are mostly useful when stack receives them * with the link they arrive on. */ return RX_HANDLER_PASS; /* allow exact match delivery for disabled ports */ res = RX_HANDLER_EXACT; } else { res = team->ops.receive(team, port, skb); } if (res == RX_HANDLER_ANOTHER) { struct team_pcpu_stats *pcpu_stats; pcpu_stats = this_cpu_ptr(team->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->rx_packets); u64_stats_add(&pcpu_stats->rx_bytes, skb->len); if (skb->pkt_type == PACKET_MULTICAST) u64_stats_inc(&pcpu_stats->rx_multicast); u64_stats_update_end(&pcpu_stats->syncp); skb->dev = team->dev; } else if (res == RX_HANDLER_EXACT) { this_cpu_inc(team->pcpu_stats->rx_nohandler); } else { this_cpu_inc(team->pcpu_stats->rx_dropped); } return res; } /************************************* * Multiqueue Tx port select override *************************************/ static int team_queue_override_init(struct team *team) { struct list_head *listarr; unsigned int queue_cnt = team->dev->num_tx_queues - 1; unsigned int i; if (!queue_cnt) return 0; listarr = kmalloc_array(queue_cnt, sizeof(struct list_head), GFP_KERNEL); if (!listarr) return -ENOMEM; team->qom_lists = listarr; for (i = 0; i < queue_cnt; i++) INIT_LIST_HEAD(listarr++); return 0; } static void team_queue_override_fini(struct team *team) { kfree(team->qom_lists); } static struct list_head *__team_get_qom_list(struct team *team, u16 queue_id) { return &team->qom_lists[queue_id - 1]; } /* * note: already called with rcu_read_lock */ static bool team_queue_override_transmit(struct team *team, struct sk_buff *skb) { struct list_head *qom_list; struct team_port *port; if (!team->queue_override_enabled || !skb->queue_mapping) return false; qom_list = __team_get_qom_list(team, skb->queue_mapping); list_for_each_entry_rcu(port, qom_list, qom_list) { if (!team_dev_queue_xmit(team, port, skb)) return true; } return false; } static void __team_queue_override_port_del(struct team *team, struct team_port *port) { if (!port->queue_id) return; list_del_rcu(&port->qom_list); } static bool team_queue_override_port_has_gt_prio_than(struct team_port *port, struct team_port *cur) { if (port->priority < cur->priority) return true; if (port->priority > cur->priority) return false; if (port->index < cur->index) return true; return false; } static void __team_queue_override_port_add(struct team *team, struct team_port *port) { struct team_port *cur; struct list_head *qom_list; struct list_head *node; if (!port->queue_id) return; qom_list = __team_get_qom_list(team, port->queue_id); node = qom_list; list_for_each_entry(cur, qom_list, qom_list) { if (team_queue_override_port_has_gt_prio_than(port, cur)) break; node = &cur->qom_list; } list_add_tail_rcu(&port->qom_list, node); } static void __team_queue_override_enabled_check(struct team *team) { struct team_port *port; bool enabled = false; list_for_each_entry(port, &team->port_list, list) { if (port->queue_id) { enabled = true; break; } } if (enabled == team->queue_override_enabled) return; netdev_dbg(team->dev, "%s queue override\n", enabled ? "Enabling" : "Disabling"); team->queue_override_enabled = enabled; } static void team_queue_override_port_prio_changed(struct team *team, struct team_port *port) { if (!port->queue_id || team_port_enabled(port)) return; __team_queue_override_port_del(team, port); __team_queue_override_port_add(team, port); __team_queue_override_enabled_check(team); } static void team_queue_override_port_change_queue_id(struct team *team, struct team_port *port, u16 new_queue_id) { if (team_port_enabled(port)) { __team_queue_override_port_del(team, port); port->queue_id = new_queue_id; __team_queue_override_port_add(team, port); __team_queue_override_enabled_check(team); } else { port->queue_id = new_queue_id; } } static void team_queue_override_port_add(struct team *team, struct team_port *port) { __team_queue_override_port_add(team, port); __team_queue_override_enabled_check(team); } static void team_queue_override_port_del(struct team *team, struct team_port *port) { __team_queue_override_port_del(team, port); __team_queue_override_enabled_check(team); } /**************** * Port handling ****************/ static bool team_port_find(const struct team *team, const struct team_port *port) { struct team_port *cur; list_for_each_entry(cur, &team->port_list, list) if (cur == port) return true; return false; } /* * Enable/disable port by adding to enabled port hashlist and setting * port->index (Might be racy so reader could see incorrect ifindex when * processing a flying packet, but that is not a problem). Write guarded * by team->lock. */ static void team_port_enable(struct team *team, struct team_port *port) { if (team_port_enabled(port)) return; port->index = team->en_port_count++; hlist_add_head_rcu(&port->hlist, team_port_index_hash(team, port->index)); team_adjust_ops(team); team_queue_override_port_add(team, port); if (team->ops.port_enabled) team->ops.port_enabled(team, port); team_notify_peers(team); team_mcast_rejoin(team); team_lower_state_changed(port); } static void __reconstruct_port_hlist(struct team *team, int rm_index) { int i; struct team_port *port; for (i = rm_index + 1; i < team->en_port_count; i++) { port = team_get_port_by_index(team, i); hlist_del_rcu(&port->hlist); port->index--; hlist_add_head_rcu(&port->hlist, team_port_index_hash(team, port->index)); } } static void team_port_disable(struct team *team, struct team_port *port) { if (!team_port_enabled(port)) return; if (team->ops.port_disabled) team->ops.port_disabled(team, port); hlist_del_rcu(&port->hlist); __reconstruct_port_hlist(team, port->index); port->index = -1; team->en_port_count--; team_queue_override_port_del(team, port); team_adjust_ops(team); team_lower_state_changed(port); } #define TEAM_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | \ NETIF_F_HIGHDMA | NETIF_F_LRO) #define TEAM_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE) static void __team_compute_features(struct team *team) { struct team_port *port; netdev_features_t vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL; netdev_features_t enc_features = TEAM_ENC_FEATURES; unsigned short max_hard_header_len = ETH_HLEN; unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { vlan_features = netdev_increment_features(vlan_features, port->dev->vlan_features, TEAM_VLAN_FEATURES); enc_features = netdev_increment_features(enc_features, port->dev->hw_enc_features, TEAM_ENC_FEATURES); dst_release_flag &= port->dev->priv_flags; if (port->dev->hard_header_len > max_hard_header_len) max_hard_header_len = port->dev->hard_header_len; } rcu_read_unlock(); team->dev->vlan_features = vlan_features; team->dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; team->dev->hard_header_len = max_hard_header_len; team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; if (dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) team->dev->priv_flags |= IFF_XMIT_DST_RELEASE; } static void team_compute_features(struct team *team) { __team_compute_features(team); netdev_change_features(team->dev); } static int team_port_enter(struct team *team, struct team_port *port) { int err = 0; dev_hold(team->dev); if (team->ops.port_enter) { err = team->ops.port_enter(team, port); if (err) { netdev_err(team->dev, "Device %s failed to enter team mode\n", port->dev->name); goto err_port_enter; } } return 0; err_port_enter: dev_put(team->dev); return err; } static void team_port_leave(struct team *team, struct team_port *port) { if (team->ops.port_leave) team->ops.port_leave(team, port); dev_put(team->dev); } #ifdef CONFIG_NET_POLL_CONTROLLER static int __team_port_enable_netpoll(struct team_port *port) { struct netpoll *np; int err; np = kzalloc(sizeof(*np), GFP_KERNEL); if (!np) return -ENOMEM; err = __netpoll_setup(np, port->dev); if (err) { kfree(np); return err; } port->np = np; return err; } static int team_port_enable_netpoll(struct team_port *port) { if (!port->team->dev->npinfo) return 0; return __team_port_enable_netpoll(port); } static void team_port_disable_netpoll(struct team_port *port) { struct netpoll *np = port->np; if (!np) return; port->np = NULL; __netpoll_free(np); } #else static int team_port_enable_netpoll(struct team_port *port) { return 0; } static void team_port_disable_netpoll(struct team_port *port) { } #endif static int team_upper_dev_link(struct team *team, struct team_port *port, struct netlink_ext_ack *extack) { struct netdev_lag_upper_info lag_upper_info; int err; lag_upper_info.tx_type = team->mode->lag_tx_type; lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN; err = netdev_master_upper_dev_link(port->dev, team->dev, NULL, &lag_upper_info, extack); if (err) return err; port->dev->priv_flags |= IFF_TEAM_PORT; return 0; } static void team_upper_dev_unlink(struct team *team, struct team_port *port) { netdev_upper_dev_unlink(port->dev, team->dev); port->dev->priv_flags &= ~IFF_TEAM_PORT; } static void __team_port_change_port_added(struct team_port *port, bool linkup); static int team_dev_type_check_change(struct net_device *dev, struct net_device *port_dev); static int team_port_add(struct team *team, struct net_device *port_dev, struct netlink_ext_ack *extack) { struct net_device *dev = team->dev; struct team_port *port; char *portname = port_dev->name; int err; if (port_dev->flags & IFF_LOOPBACK) { NL_SET_ERR_MSG(extack, "Loopback device can't be added as a team port"); netdev_err(dev, "Device %s is loopback device. Loopback devices can't be added as a team port\n", portname); return -EINVAL; } if (netif_is_team_port(port_dev)) { NL_SET_ERR_MSG(extack, "Device is already a port of a team device"); netdev_err(dev, "Device %s is already a port " "of a team device\n", portname); return -EBUSY; } if (dev == port_dev) { NL_SET_ERR_MSG(extack, "Cannot enslave team device to itself"); netdev_err(dev, "Cannot enslave team device to itself\n"); return -EINVAL; } if (netdev_has_upper_dev(dev, port_dev)) { NL_SET_ERR_MSG(extack, "Device is already an upper device of the team interface"); netdev_err(dev, "Device %s is already an upper device of the team interface\n", portname); return -EBUSY; } if (port_dev->features & NETIF_F_VLAN_CHALLENGED && vlan_uses_dev(dev)) { NL_SET_ERR_MSG(extack, "Device is VLAN challenged and team device has VLAN set up"); netdev_err(dev, "Device %s is VLAN challenged and team device has VLAN set up\n", portname); return -EPERM; } err = team_dev_type_check_change(dev, port_dev); if (err) return err; if (port_dev->flags & IFF_UP) { NL_SET_ERR_MSG(extack, "Device is up. Set it down before adding it as a team port"); netdev_err(dev, "Device %s is up. Set it down before adding it as a team port\n", portname); return -EBUSY; } port = kzalloc(sizeof(struct team_port) + team->mode->port_priv_size, GFP_KERNEL); if (!port) return -ENOMEM; port->dev = port_dev; port->team = team; INIT_LIST_HEAD(&port->qom_list); port->orig.mtu = port_dev->mtu; err = dev_set_mtu(port_dev, dev->mtu); if (err) { netdev_dbg(dev, "Error %d calling dev_set_mtu\n", err); goto err_set_mtu; } memcpy(port->orig.dev_addr, port_dev->dev_addr, port_dev->addr_len); err = team_port_enter(team, port); if (err) { netdev_err(dev, "Device %s failed to enter team mode\n", portname); goto err_port_enter; } err = dev_open(port_dev, extack); if (err) { netdev_dbg(dev, "Device %s opening failed\n", portname); goto err_dev_open; } err = vlan_vids_add_by_dev(port_dev, dev); if (err) { netdev_err(dev, "Failed to add vlan ids to device %s\n", portname); goto err_vids_add; } err = team_port_enable_netpoll(port); if (err) { netdev_err(dev, "Failed to enable netpoll on device %s\n", portname); goto err_enable_netpoll; } if (!(dev->features & NETIF_F_LRO)) dev_disable_lro(port_dev); err = netdev_rx_handler_register(port_dev, team_handle_frame, port); if (err) { netdev_err(dev, "Device %s failed to register rx_handler\n", portname); goto err_handler_register; } err = team_upper_dev_link(team, port, extack); if (err) { netdev_err(dev, "Device %s failed to set upper link\n", portname); goto err_set_upper_link; } err = __team_option_inst_add_port(team, port); if (err) { netdev_err(dev, "Device %s failed to add per-port options\n", portname); goto err_option_port_add; } /* set promiscuity level to new slave */ if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(port_dev, 1); if (err) goto err_set_slave_promisc; } /* set allmulti level to new slave */ if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(port_dev, 1); if (err) { if (dev->flags & IFF_PROMISC) dev_set_promiscuity(port_dev, -1); goto err_set_slave_promisc; } } if (dev->flags & IFF_UP) { netif_addr_lock_bh(dev); dev_uc_sync_multiple(port_dev, dev); dev_mc_sync_multiple(port_dev, dev); netif_addr_unlock_bh(dev); } port->index = -1; list_add_tail_rcu(&port->list, &team->port_list); team_port_enable(team, port); __team_compute_features(team); __team_port_change_port_added(port, !!netif_oper_up(port_dev)); __team_options_change_check(team); netdev_info(dev, "Port device %s added\n", portname); return 0; err_set_slave_promisc: __team_option_inst_del_port(team, port); err_option_port_add: team_upper_dev_unlink(team, port); err_set_upper_link: netdev_rx_handler_unregister(port_dev); err_handler_register: team_port_disable_netpoll(port); err_enable_netpoll: vlan_vids_del_by_dev(port_dev, dev); err_vids_add: dev_close(port_dev); err_dev_open: team_port_leave(team, port); team_port_set_orig_dev_addr(port); err_port_enter: dev_set_mtu(port_dev, port->orig.mtu); err_set_mtu: kfree(port); return err; } static void __team_port_change_port_removed(struct team_port *port); static int team_port_del(struct team *team, struct net_device *port_dev) { struct net_device *dev = team->dev; struct team_port *port; char *portname = port_dev->name; port = team_port_get_rtnl(port_dev); if (!port || !team_port_find(team, port)) { netdev_err(dev, "Device %s does not act as a port of this team\n", portname); return -ENOENT; } team_port_disable(team, port); list_del_rcu(&port->list); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(port_dev, -1); if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(port_dev, -1); team_upper_dev_unlink(team, port); netdev_rx_handler_unregister(port_dev); team_port_disable_netpoll(port); vlan_vids_del_by_dev(port_dev, dev); if (dev->flags & IFF_UP) { dev_uc_unsync(port_dev, dev); dev_mc_unsync(port_dev, dev); } dev_close(port_dev); team_port_leave(team, port); __team_option_inst_mark_removed_port(team, port); __team_options_change_check(team); __team_option_inst_del_port(team, port); __team_port_change_port_removed(port); team_port_set_orig_dev_addr(port); dev_set_mtu(port_dev, port->orig.mtu); kfree_rcu(port, rcu); netdev_info(dev, "Port device %s removed\n", portname); __team_compute_features(team); return 0; } /***************** * Net device ops *****************/ static void team_mode_option_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.str_val = team->mode->kind; } static int team_mode_option_set(struct team *team, struct team_gsetter_ctx *ctx) { return team_change_mode(team, ctx->data.str_val); } static void team_notify_peers_count_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.u32_val = team->notify_peers.count; } static int team_notify_peers_count_set(struct team *team, struct team_gsetter_ctx *ctx) { team->notify_peers.count = ctx->data.u32_val; return 0; } static void team_notify_peers_interval_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.u32_val = team->notify_peers.interval; } static int team_notify_peers_interval_set(struct team *team, struct team_gsetter_ctx *ctx) { team->notify_peers.interval = ctx->data.u32_val; return 0; } static void team_mcast_rejoin_count_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.u32_val = team->mcast_rejoin.count; } static int team_mcast_rejoin_count_set(struct team *team, struct team_gsetter_ctx *ctx) { team->mcast_rejoin.count = ctx->data.u32_val; return 0; } static void team_mcast_rejoin_interval_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.u32_val = team->mcast_rejoin.interval; } static int team_mcast_rejoin_interval_set(struct team *team, struct team_gsetter_ctx *ctx) { team->mcast_rejoin.interval = ctx->data.u32_val; return 0; } static void team_port_en_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.bool_val = team_port_enabled(port); } static int team_port_en_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; if (ctx->data.bool_val) team_port_enable(team, port); else team_port_disable(team, port); return 0; } static void team_user_linkup_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.bool_val = port->user.linkup; } static void __team_carrier_check(struct team *team); static int team_user_linkup_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; port->user.linkup = ctx->data.bool_val; team_refresh_port_linkup(port); __team_carrier_check(port->team); return 0; } static void team_user_linkup_en_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.bool_val = port->user.linkup_enabled; } static int team_user_linkup_en_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; port->user.linkup_enabled = ctx->data.bool_val; team_refresh_port_linkup(port); __team_carrier_check(port->team); return 0; } static void team_priority_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.s32_val = port->priority; } static int team_priority_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; s32 priority = ctx->data.s32_val; if (port->priority == priority) return 0; port->priority = priority; team_queue_override_port_prio_changed(team, port); return 0; } static void team_queue_id_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.u32_val = port->queue_id; } static int team_queue_id_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; u16 new_queue_id = ctx->data.u32_val; if (port->queue_id == new_queue_id) return 0; if (new_queue_id >= team->dev->real_num_tx_queues) return -EINVAL; team_queue_override_port_change_queue_id(team, port, new_queue_id); return 0; } static const struct team_option team_options[] = { { .name = "mode", .type = TEAM_OPTION_TYPE_STRING, .getter = team_mode_option_get, .setter = team_mode_option_set, }, { .name = "notify_peers_count", .type = TEAM_OPTION_TYPE_U32, .getter = team_notify_peers_count_get, .setter = team_notify_peers_count_set, }, { .name = "notify_peers_interval", .type = TEAM_OPTION_TYPE_U32, .getter = team_notify_peers_interval_get, .setter = team_notify_peers_interval_set, }, { .name = "mcast_rejoin_count", .type = TEAM_OPTION_TYPE_U32, .getter = team_mcast_rejoin_count_get, .setter = team_mcast_rejoin_count_set, }, { .name = "mcast_rejoin_interval", .type = TEAM_OPTION_TYPE_U32, .getter = team_mcast_rejoin_interval_get, .setter = team_mcast_rejoin_interval_set, }, { .name = "enabled", .type = TEAM_OPTION_TYPE_BOOL, .per_port = true, .getter = team_port_en_option_get, .setter = team_port_en_option_set, }, { .name = "user_linkup", .type = TEAM_OPTION_TYPE_BOOL, .per_port = true, .getter = team_user_linkup_option_get, .setter = team_user_linkup_option_set, }, { .name = "user_linkup_enabled", .type = TEAM_OPTION_TYPE_BOOL, .per_port = true, .getter = team_user_linkup_en_option_get, .setter = team_user_linkup_en_option_set, }, { .name = "priority", .type = TEAM_OPTION_TYPE_S32, .per_port = true, .getter = team_priority_option_get, .setter = team_priority_option_set, }, { .name = "queue_id", .type = TEAM_OPTION_TYPE_U32, .per_port = true, .getter = team_queue_id_option_get, .setter = team_queue_id_option_set, }, }; static int team_init(struct net_device *dev) { struct team *team = netdev_priv(dev); int i; int err; team->dev = dev; team_set_no_mode(team); team->notifier_ctx = false; team->pcpu_stats = netdev_alloc_pcpu_stats(struct team_pcpu_stats); if (!team->pcpu_stats) return -ENOMEM; for (i = 0; i < TEAM_PORT_HASHENTRIES; i++) INIT_HLIST_HEAD(&team->en_port_hlist[i]); INIT_LIST_HEAD(&team->port_list); err = team_queue_override_init(team); if (err) goto err_team_queue_override_init; team_adjust_ops(team); INIT_LIST_HEAD(&team->option_list); INIT_LIST_HEAD(&team->option_inst_list); team_notify_peers_init(team); team_mcast_rejoin_init(team); err = team_options_register(team, team_options, ARRAY_SIZE(team_options)); if (err) goto err_options_register; netif_carrier_off(dev); lockdep_register_key(&team->team_lock_key); __mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key); netdev_lockdep_set_classes(dev); return 0; err_options_register: team_mcast_rejoin_fini(team); team_notify_peers_fini(team); team_queue_override_fini(team); err_team_queue_override_init: free_percpu(team->pcpu_stats); return err; } static void team_uninit(struct net_device *dev) { struct team *team = netdev_priv(dev); struct team_port *port; struct team_port *tmp; mutex_lock(&team->lock); list_for_each_entry_safe(port, tmp, &team->port_list, list) team_port_del(team, port->dev); __team_change_mode(team, NULL); /* cleanup */ __team_options_unregister(team, team_options, ARRAY_SIZE(team_options)); team_mcast_rejoin_fini(team); team_notify_peers_fini(team); team_queue_override_fini(team); mutex_unlock(&team->lock); netdev_change_features(dev); lockdep_unregister_key(&team->team_lock_key); } static void team_destructor(struct net_device *dev) { struct team *team = netdev_priv(dev); free_percpu(team->pcpu_stats); } static int team_open(struct net_device *dev) { return 0; } static int team_close(struct net_device *dev) { struct team *team = netdev_priv(dev); struct team_port *port; list_for_each_entry(port, &team->port_list, list) { dev_uc_unsync(port->dev, dev); dev_mc_unsync(port->dev, dev); } return 0; } /* * note: already called with rcu_read_lock */ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev) { struct team *team = netdev_priv(dev); bool tx_success; unsigned int len = skb->len; tx_success = team_queue_override_transmit(team, skb); if (!tx_success) tx_success = team->ops.transmit(team, skb); if (tx_success) { struct team_pcpu_stats *pcpu_stats; pcpu_stats = this_cpu_ptr(team->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->tx_packets); u64_stats_add(&pcpu_stats->tx_bytes, len); u64_stats_update_end(&pcpu_stats->syncp); } else { this_cpu_inc(team->pcpu_stats->tx_dropped); } return NETDEV_TX_OK; } static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { /* * This helper function exists to help dev_pick_tx get the correct * destination queue. Using a helper function skips a call to * skb_tx_hash and will put the skbs in the queue we expect on their * way down to the team driver. */ u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0; /* * Save the original txq to restore before passing to the driver */ qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping; if (unlikely(txq >= dev->real_num_tx_queues)) { do { txq -= dev->real_num_tx_queues; } while (txq >= dev->real_num_tx_queues); } return txq; } static void team_change_rx_flags(struct net_device *dev, int change) { struct team *team = netdev_priv(dev); struct team_port *port; int inc; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { if (change & IFF_PROMISC) { inc = dev->flags & IFF_PROMISC ? 1 : -1; dev_set_promiscuity(port->dev, inc); } if (change & IFF_ALLMULTI) { inc = dev->flags & IFF_ALLMULTI ? 1 : -1; dev_set_allmulti(port->dev, inc); } } rcu_read_unlock(); } static void team_set_rx_mode(struct net_device *dev) { struct team *team = netdev_priv(dev); struct team_port *port; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { dev_uc_sync_multiple(port->dev, dev); dev_mc_sync_multiple(port->dev, dev); } rcu_read_unlock(); } static int team_set_mac_address(struct net_device *dev, void *p) { struct sockaddr *addr = p; struct team *team = netdev_priv(dev); struct team_port *port; if (dev->type == ARPHRD_ETHER && !is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; dev_addr_set(dev, addr->sa_data); mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) if (team->ops.port_change_dev_addr) team->ops.port_change_dev_addr(team, port); mutex_unlock(&team->lock); return 0; } static int team_change_mtu(struct net_device *dev, int new_mtu) { struct team *team = netdev_priv(dev); struct team_port *port; int err; /* * Alhough this is reader, it's guarded by team lock. It's not possible * to traverse list in reverse under rcu_read_lock */ mutex_lock(&team->lock); team->port_mtu_change_allowed = true; list_for_each_entry(port, &team->port_list, list) { err = dev_set_mtu(port->dev, new_mtu); if (err) { netdev_err(dev, "Device %s failed to change mtu", port->dev->name); goto unwind; } } team->port_mtu_change_allowed = false; mutex_unlock(&team->lock); dev->mtu = new_mtu; return 0; unwind: list_for_each_entry_continue_reverse(port, &team->port_list, list) dev_set_mtu(port->dev, dev->mtu); team->port_mtu_change_allowed = false; mutex_unlock(&team->lock); return err; } static void team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct team *team = netdev_priv(dev); struct team_pcpu_stats *p; u64 rx_packets, rx_bytes, rx_multicast, tx_packets, tx_bytes; u32 rx_dropped = 0, tx_dropped = 0, rx_nohandler = 0; unsigned int start; int i; for_each_possible_cpu(i) { p = per_cpu_ptr(team->pcpu_stats, i); do { start = u64_stats_fetch_begin(&p->syncp); rx_packets = u64_stats_read(&p->rx_packets); rx_bytes = u64_stats_read(&p->rx_bytes); rx_multicast = u64_stats_read(&p->rx_multicast); tx_packets = u64_stats_read(&p->tx_packets); tx_bytes = u64_stats_read(&p->tx_bytes); } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; stats->multicast += rx_multicast; stats->tx_packets += tx_packets; stats->tx_bytes += tx_bytes; /* * rx_dropped, tx_dropped & rx_nohandler are u32, * updated without syncp protection. */ rx_dropped += READ_ONCE(p->rx_dropped); tx_dropped += READ_ONCE(p->tx_dropped); rx_nohandler += READ_ONCE(p->rx_nohandler); } stats->rx_dropped = rx_dropped; stats->tx_dropped = tx_dropped; stats->rx_nohandler = rx_nohandler; } static int team_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct team *team = netdev_priv(dev); struct team_port *port; int err; /* * Alhough this is reader, it's guarded by team lock. It's not possible * to traverse list in reverse under rcu_read_lock */ mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) { err = vlan_vid_add(port->dev, proto, vid); if (err) goto unwind; } mutex_unlock(&team->lock); return 0; unwind: list_for_each_entry_continue_reverse(port, &team->port_list, list) vlan_vid_del(port->dev, proto, vid); mutex_unlock(&team->lock); return err; } static int team_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct team *team = netdev_priv(dev); struct team_port *port; mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) vlan_vid_del(port->dev, proto, vid); mutex_unlock(&team->lock); return 0; } #ifdef CONFIG_NET_POLL_CONTROLLER static void team_poll_controller(struct net_device *dev) { } static void __team_netpoll_cleanup(struct team *team) { struct team_port *port; list_for_each_entry(port, &team->port_list, list) team_port_disable_netpoll(port); } static void team_netpoll_cleanup(struct net_device *dev) { struct team *team = netdev_priv(dev); mutex_lock(&team->lock); __team_netpoll_cleanup(team); mutex_unlock(&team->lock); } static int team_netpoll_setup(struct net_device *dev, struct netpoll_info *npifo) { struct team *team = netdev_priv(dev); struct team_port *port; int err = 0; mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) { err = __team_port_enable_netpoll(port); if (err) { __team_netpoll_cleanup(team); break; } } mutex_unlock(&team->lock); return err; } #endif static int team_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { struct team *team = netdev_priv(dev); int err; mutex_lock(&team->lock); err = team_port_add(team, port_dev, extack); mutex_unlock(&team->lock); if (!err) netdev_change_features(dev); return err; } static int team_del_slave(struct net_device *dev, struct net_device *port_dev) { struct team *team = netdev_priv(dev); int err; mutex_lock(&team->lock); err = team_port_del(team, port_dev); mutex_unlock(&team->lock); if (err) return err; if (netif_is_team_master(port_dev)) { lockdep_unregister_key(&team->team_lock_key); lockdep_register_key(&team->team_lock_key); lockdep_set_class(&team->lock, &team->team_lock_key); } netdev_change_features(dev); return err; } static netdev_features_t team_fix_features(struct net_device *dev, netdev_features_t features) { struct team_port *port; struct team *team = netdev_priv(dev); netdev_features_t mask; mask = features; features &= ~NETIF_F_ONE_FOR_ALL; features |= NETIF_F_ALL_FOR_ALL; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { features = netdev_increment_features(features, port->dev->features, mask); } rcu_read_unlock(); features = netdev_add_tso_features(features, mask); return features; } static int team_change_carrier(struct net_device *dev, bool new_carrier) { struct team *team = netdev_priv(dev); team->user_carrier_enabled = true; if (new_carrier) netif_carrier_on(dev); else netif_carrier_off(dev); return 0; } static const struct net_device_ops team_netdev_ops = { .ndo_init = team_init, .ndo_uninit = team_uninit, .ndo_open = team_open, .ndo_stop = team_close, .ndo_start_xmit = team_xmit, .ndo_select_queue = team_select_queue, .ndo_change_rx_flags = team_change_rx_flags, .ndo_set_rx_mode = team_set_rx_mode, .ndo_set_mac_address = team_set_mac_address, .ndo_change_mtu = team_change_mtu, .ndo_get_stats64 = team_get_stats64, .ndo_vlan_rx_add_vid = team_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = team_vlan_rx_kill_vid, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = team_poll_controller, .ndo_netpoll_setup = team_netpoll_setup, .ndo_netpoll_cleanup = team_netpoll_cleanup, #endif .ndo_add_slave = team_add_slave, .ndo_del_slave = team_del_slave, .ndo_fix_features = team_fix_features, .ndo_change_carrier = team_change_carrier, .ndo_features_check = passthru_features_check, }; /*********************** * ethtool interface ***********************/ static void team_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); strscpy(drvinfo->version, UTS_RELEASE, sizeof(drvinfo->version)); } static int team_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct team *team= netdev_priv(dev); unsigned long speed = 0; struct team_port *port; cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { if (team_port_txable(port)) { if (port->state.speed != SPEED_UNKNOWN) speed += port->state.speed; if (cmd->base.duplex == DUPLEX_UNKNOWN && port->state.duplex != DUPLEX_UNKNOWN) cmd->base.duplex = port->state.duplex; } } rcu_read_unlock(); cmd->base.speed = speed ? : SPEED_UNKNOWN; return 0; } static const struct ethtool_ops team_ethtool_ops = { .get_drvinfo = team_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = team_ethtool_get_link_ksettings, }; /*********************** * rt netlink interface ***********************/ static void team_setup_by_port(struct net_device *dev, struct net_device *port_dev) { struct team *team = netdev_priv(dev); if (port_dev->type == ARPHRD_ETHER) dev->header_ops = team->header_ops_cache; else dev->header_ops = port_dev->header_ops; dev->type = port_dev->type; dev->hard_header_len = port_dev->hard_header_len; dev->needed_headroom = port_dev->needed_headroom; dev->addr_len = port_dev->addr_len; dev->mtu = port_dev->mtu; memcpy(dev->broadcast, port_dev->broadcast, port_dev->addr_len); eth_hw_addr_inherit(dev, port_dev); if (port_dev->flags & IFF_POINTOPOINT) { dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST); dev->flags |= (IFF_POINTOPOINT | IFF_NOARP); } else if ((port_dev->flags & (IFF_BROADCAST | IFF_MULTICAST)) == (IFF_BROADCAST | IFF_MULTICAST)) { dev->flags |= (IFF_BROADCAST | IFF_MULTICAST); dev->flags &= ~(IFF_POINTOPOINT | IFF_NOARP); } } static int team_dev_type_check_change(struct net_device *dev, struct net_device *port_dev) { struct team *team = netdev_priv(dev); char *portname = port_dev->name; int err; if (dev->type == port_dev->type) return 0; if (!list_empty(&team->port_list)) { netdev_err(dev, "Device %s is of different type\n", portname); return -EBUSY; } err = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, dev); err = notifier_to_errno(err); if (err) { netdev_err(dev, "Refused to change device type\n"); return err; } dev_uc_flush(dev); dev_mc_flush(dev); team_setup_by_port(dev, port_dev); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev); return 0; } static void team_setup(struct net_device *dev) { struct team *team = netdev_priv(dev); ether_setup(dev); dev->max_mtu = ETH_MAX_MTU; team->header_ops_cache = dev->header_ops; dev->netdev_ops = &team_netdev_ops; dev->ethtool_ops = &team_ethtool_ops; dev->needs_free_netdev = true; dev->priv_destructor = team_destructor; dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_TEAM; /* * Indicate we support unicast address filtering. That way core won't * bring us to promisc mode in case a unicast addr is added. * Let this up to underlay drivers. */ dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE; dev->features |= NETIF_F_LLTX; dev->features |= NETIF_F_GRO; /* Don't allow team devices to change network namespaces. */ dev->features |= NETIF_F_NETNS_LOCAL; dev->hw_features = TEAM_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_STAG_FILTER; dev->hw_features |= NETIF_F_GSO_ENCAP_ALL; dev->features |= dev->hw_features; dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; } static int team_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS] == NULL) eth_hw_addr_random(dev); return register_netdevice(dev); } static int team_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } return 0; } static unsigned int team_get_num_tx_queues(void) { return TEAM_DEFAULT_NUM_TX_QUEUES; } static unsigned int team_get_num_rx_queues(void) { return TEAM_DEFAULT_NUM_RX_QUEUES; } static struct rtnl_link_ops team_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct team), .setup = team_setup, .newlink = team_newlink, .validate = team_validate, .get_num_tx_queues = team_get_num_tx_queues, .get_num_rx_queues = team_get_num_rx_queues, }; /*********************************** * Generic netlink custom interface ***********************************/ static struct genl_family team_nl_family; static const struct nla_policy team_nl_policy[TEAM_ATTR_MAX + 1] = { [TEAM_ATTR_UNSPEC] = { .type = NLA_UNSPEC, }, [TEAM_ATTR_TEAM_IFINDEX] = { .type = NLA_U32 }, [TEAM_ATTR_LIST_OPTION] = { .type = NLA_NESTED }, [TEAM_ATTR_LIST_PORT] = { .type = NLA_NESTED }, }; static const struct nla_policy team_nl_option_policy[TEAM_ATTR_OPTION_MAX + 1] = { [TEAM_ATTR_OPTION_UNSPEC] = { .type = NLA_UNSPEC, }, [TEAM_ATTR_OPTION_NAME] = { .type = NLA_STRING, .len = TEAM_STRING_MAX_LEN, }, [TEAM_ATTR_OPTION_CHANGED] = { .type = NLA_FLAG }, [TEAM_ATTR_OPTION_TYPE] = { .type = NLA_U8 }, [TEAM_ATTR_OPTION_DATA] = { .type = NLA_BINARY }, [TEAM_ATTR_OPTION_PORT_IFINDEX] = { .type = NLA_U32 }, [TEAM_ATTR_OPTION_ARRAY_INDEX] = { .type = NLA_U32 }, }; static int team_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; void *hdr; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &team_nl_family, 0, TEAM_CMD_NOOP); if (!hdr) { err = -EMSGSIZE; goto err_msg_put; } genlmsg_end(msg, hdr); return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); err_msg_put: nlmsg_free(msg); return err; } /* * Netlink cmd functions should be locked by following two functions. * Since dev gets held here, that ensures dev won't disappear in between. */ static struct team *team_nl_team_get(struct genl_info *info) { struct net *net = genl_info_net(info); int ifindex; struct net_device *dev; struct team *team; if (!info->attrs[TEAM_ATTR_TEAM_IFINDEX]) return NULL; ifindex = nla_get_u32(info->attrs[TEAM_ATTR_TEAM_IFINDEX]); dev = dev_get_by_index(net, ifindex); if (!dev || dev->netdev_ops != &team_netdev_ops) { dev_put(dev); return NULL; } team = netdev_priv(dev); mutex_lock(&team->lock); return team; } static void team_nl_team_put(struct team *team) { mutex_unlock(&team->lock); dev_put(team->dev); } typedef int team_nl_send_func_t(struct sk_buff *skb, struct team *team, u32 portid); static int team_nl_send_unicast(struct sk_buff *skb, struct team *team, u32 portid) { return genlmsg_unicast(dev_net(team->dev), skb, portid); } static int team_nl_fill_one_option_get(struct sk_buff *skb, struct team *team, struct team_option_inst *opt_inst) { struct nlattr *option_item; struct team_option *option = opt_inst->option; struct team_option_inst_info *opt_inst_info = &opt_inst->info; struct team_gsetter_ctx ctx; int err; ctx.info = opt_inst_info; err = team_option_get(team, opt_inst, &ctx); if (err) return err; option_item = nla_nest_start_noflag(skb, TEAM_ATTR_ITEM_OPTION); if (!option_item) return -EMSGSIZE; if (nla_put_string(skb, TEAM_ATTR_OPTION_NAME, option->name)) goto nest_cancel; if (opt_inst_info->port && nla_put_u32(skb, TEAM_ATTR_OPTION_PORT_IFINDEX, opt_inst_info->port->dev->ifindex)) goto nest_cancel; if (opt_inst->option->array_size && nla_put_u32(skb, TEAM_ATTR_OPTION_ARRAY_INDEX, opt_inst_info->array_index)) goto nest_cancel; switch (option->type) { case TEAM_OPTION_TYPE_U32: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_U32)) goto nest_cancel; if (nla_put_u32(skb, TEAM_ATTR_OPTION_DATA, ctx.data.u32_val)) goto nest_cancel; break; case TEAM_OPTION_TYPE_STRING: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_STRING)) goto nest_cancel; if (nla_put_string(skb, TEAM_ATTR_OPTION_DATA, ctx.data.str_val)) goto nest_cancel; break; case TEAM_OPTION_TYPE_BINARY: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_BINARY)) goto nest_cancel; if (nla_put(skb, TEAM_ATTR_OPTION_DATA, ctx.data.bin_val.len, ctx.data.bin_val.ptr)) goto nest_cancel; break; case TEAM_OPTION_TYPE_BOOL: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_FLAG)) goto nest_cancel; if (ctx.data.bool_val && nla_put_flag(skb, TEAM_ATTR_OPTION_DATA)) goto nest_cancel; break; case TEAM_OPTION_TYPE_S32: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_S32)) goto nest_cancel; if (nla_put_s32(skb, TEAM_ATTR_OPTION_DATA, ctx.data.s32_val)) goto nest_cancel; break; default: BUG(); } if (opt_inst->removed && nla_put_flag(skb, TEAM_ATTR_OPTION_REMOVED)) goto nest_cancel; if (opt_inst->changed) { if (nla_put_flag(skb, TEAM_ATTR_OPTION_CHANGED)) goto nest_cancel; opt_inst->changed = false; } nla_nest_end(skb, option_item); return 0; nest_cancel: nla_nest_cancel(skb, option_item); return -EMSGSIZE; } static int __send_and_alloc_skb(struct sk_buff **pskb, struct team *team, u32 portid, team_nl_send_func_t *send_func) { int err; if (*pskb) { err = send_func(*pskb, team, portid); if (err) return err; } *pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!*pskb) return -ENOMEM; return 0; } static int team_nl_send_options_get(struct team *team, u32 portid, u32 seq, int flags, team_nl_send_func_t *send_func, struct list_head *sel_opt_inst_list) { struct nlattr *option_list; struct nlmsghdr *nlh; void *hdr; struct team_option_inst *opt_inst; int err; struct sk_buff *skb = NULL; bool incomplete; int i; opt_inst = list_first_entry(sel_opt_inst_list, struct team_option_inst, tmp_list); start_again: err = __send_and_alloc_skb(&skb, team, portid, send_func); if (err) return err; hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI, TEAM_CMD_OPTIONS_GET); if (!hdr) { nlmsg_free(skb); return -EMSGSIZE; } if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex)) goto nla_put_failure; option_list = nla_nest_start_noflag(skb, TEAM_ATTR_LIST_OPTION); if (!option_list) goto nla_put_failure; i = 0; incomplete = false; list_for_each_entry_from(opt_inst, sel_opt_inst_list, tmp_list) { err = team_nl_fill_one_option_get(skb, team, opt_inst); if (err) { if (err == -EMSGSIZE) { if (!i) goto errout; incomplete = true; break; } goto errout; } i++; } nla_nest_end(skb, option_list); genlmsg_end(skb, hdr); if (incomplete) goto start_again; send_done: nlh = nlmsg_put(skb, portid, seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { err = __send_and_alloc_skb(&skb, team, portid, send_func); if (err) return err; goto send_done; } return send_func(skb, team, portid); nla_put_failure: err = -EMSGSIZE; errout: nlmsg_free(skb); return err; } static int team_nl_cmd_options_get(struct sk_buff *skb, struct genl_info *info) { struct team *team; struct team_option_inst *opt_inst; int err; LIST_HEAD(sel_opt_inst_list); team = team_nl_team_get(info); if (!team) return -EINVAL; list_for_each_entry(opt_inst, &team->option_inst_list, list) list_add_tail(&opt_inst->tmp_list, &sel_opt_inst_list); err = team_nl_send_options_get(team, info->snd_portid, info->snd_seq, NLM_F_ACK, team_nl_send_unicast, &sel_opt_inst_list); team_nl_team_put(team); return err; } static int team_nl_send_event_options_get(struct team *team, struct list_head *sel_opt_inst_list); static int team_nl_cmd_options_set(struct sk_buff *skb, struct genl_info *info) { struct team *team; int err = 0; int i; struct nlattr *nl_option; rtnl_lock(); team = team_nl_team_get(info); if (!team) { err = -EINVAL; goto rtnl_unlock; } err = -EINVAL; if (!info->attrs[TEAM_ATTR_LIST_OPTION]) { err = -EINVAL; goto team_put; } nla_for_each_nested(nl_option, info->attrs[TEAM_ATTR_LIST_OPTION], i) { struct nlattr *opt_attrs[TEAM_ATTR_OPTION_MAX + 1]; struct nlattr *attr; struct nlattr *attr_data; LIST_HEAD(opt_inst_list); enum team_option_type opt_type; int opt_port_ifindex = 0; /* != 0 for per-port options */ u32 opt_array_index = 0; bool opt_is_array = false; struct team_option_inst *opt_inst; char *opt_name; bool opt_found = false; if (nla_type(nl_option) != TEAM_ATTR_ITEM_OPTION) { err = -EINVAL; goto team_put; } err = nla_parse_nested_deprecated(opt_attrs, TEAM_ATTR_OPTION_MAX, nl_option, team_nl_option_policy, info->extack); if (err) goto team_put; if (!opt_attrs[TEAM_ATTR_OPTION_NAME] || !opt_attrs[TEAM_ATTR_OPTION_TYPE]) { err = -EINVAL; goto team_put; } switch (nla_get_u8(opt_attrs[TEAM_ATTR_OPTION_TYPE])) { case NLA_U32: opt_type = TEAM_OPTION_TYPE_U32; break; case NLA_STRING: opt_type = TEAM_OPTION_TYPE_STRING; break; case NLA_BINARY: opt_type = TEAM_OPTION_TYPE_BINARY; break; case NLA_FLAG: opt_type = TEAM_OPTION_TYPE_BOOL; break; case NLA_S32: opt_type = TEAM_OPTION_TYPE_S32; break; default: goto team_put; } attr_data = opt_attrs[TEAM_ATTR_OPTION_DATA]; if (opt_type != TEAM_OPTION_TYPE_BOOL && !attr_data) { err = -EINVAL; goto team_put; } opt_name = nla_data(opt_attrs[TEAM_ATTR_OPTION_NAME]); attr = opt_attrs[TEAM_ATTR_OPTION_PORT_IFINDEX]; if (attr) opt_port_ifindex = nla_get_u32(attr); attr = opt_attrs[TEAM_ATTR_OPTION_ARRAY_INDEX]; if (attr) { opt_is_array = true; opt_array_index = nla_get_u32(attr); } list_for_each_entry(opt_inst, &team->option_inst_list, list) { struct team_option *option = opt_inst->option; struct team_gsetter_ctx ctx; struct team_option_inst_info *opt_inst_info; int tmp_ifindex; opt_inst_info = &opt_inst->info; tmp_ifindex = opt_inst_info->port ? opt_inst_info->port->dev->ifindex : 0; if (option->type != opt_type || strcmp(option->name, opt_name) || tmp_ifindex != opt_port_ifindex || (option->array_size && !opt_is_array) || opt_inst_info->array_index != opt_array_index) continue; opt_found = true; ctx.info = opt_inst_info; switch (opt_type) { case TEAM_OPTION_TYPE_U32: ctx.data.u32_val = nla_get_u32(attr_data); break; case TEAM_OPTION_TYPE_STRING: if (nla_len(attr_data) > TEAM_STRING_MAX_LEN) { err = -EINVAL; goto team_put; } ctx.data.str_val = nla_data(attr_data); break; case TEAM_OPTION_TYPE_BINARY: ctx.data.bin_val.len = nla_len(attr_data); ctx.data.bin_val.ptr = nla_data(attr_data); break; case TEAM_OPTION_TYPE_BOOL: ctx.data.bool_val = attr_data ? true : false; break; case TEAM_OPTION_TYPE_S32: ctx.data.s32_val = nla_get_s32(attr_data); break; default: BUG(); } err = team_option_set(team, opt_inst, &ctx); if (err) goto team_put; opt_inst->changed = true; list_add(&opt_inst->tmp_list, &opt_inst_list); } if (!opt_found) { err = -ENOENT; goto team_put; } err = team_nl_send_event_options_get(team, &opt_inst_list); if (err) break; } team_put: team_nl_team_put(team); rtnl_unlock: rtnl_unlock(); return err; } static int team_nl_fill_one_port_get(struct sk_buff *skb, struct team_port *port) { struct nlattr *port_item; port_item = nla_nest_start_noflag(skb, TEAM_ATTR_ITEM_PORT); if (!port_item) goto nest_cancel; if (nla_put_u32(skb, TEAM_ATTR_PORT_IFINDEX, port->dev->ifindex)) goto nest_cancel; if (port->changed) { if (nla_put_flag(skb, TEAM_ATTR_PORT_CHANGED)) goto nest_cancel; port->changed = false; } if ((port->removed && nla_put_flag(skb, TEAM_ATTR_PORT_REMOVED)) || (port->state.linkup && nla_put_flag(skb, TEAM_ATTR_PORT_LINKUP)) || nla_put_u32(skb, TEAM_ATTR_PORT_SPEED, port->state.speed) || nla_put_u8(skb, TEAM_ATTR_PORT_DUPLEX, port->state.duplex)) goto nest_cancel; nla_nest_end(skb, port_item); return 0; nest_cancel: nla_nest_cancel(skb, port_item); return -EMSGSIZE; } static int team_nl_send_port_list_get(struct team *team, u32 portid, u32 seq, int flags, team_nl_send_func_t *send_func, struct team_port *one_port) { struct nlattr *port_list; struct nlmsghdr *nlh; void *hdr; struct team_port *port; int err; struct sk_buff *skb = NULL; bool incomplete; int i; port = list_first_entry_or_null(&team->port_list, struct team_port, list); start_again: err = __send_and_alloc_skb(&skb, team, portid, send_func); if (err) return err; hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI, TEAM_CMD_PORT_LIST_GET); if (!hdr) { nlmsg_free(skb); return -EMSGSIZE; } if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex)) goto nla_put_failure; port_list = nla_nest_start_noflag(skb, TEAM_ATTR_LIST_PORT); if (!port_list) goto nla_put_failure; i = 0; incomplete = false; /* If one port is selected, called wants to send port list containing * only this port. Otherwise go through all listed ports and send all */ if (one_port) { err = team_nl_fill_one_port_get(skb, one_port); if (err) goto errout; } else if (port) { list_for_each_entry_from(port, &team->port_list, list) { err = team_nl_fill_one_port_get(skb, port); if (err) { if (err == -EMSGSIZE) { if (!i) goto errout; incomplete = true; break; } goto errout; } i++; } } nla_nest_end(skb, port_list); genlmsg_end(skb, hdr); if (incomplete) goto start_again; send_done: nlh = nlmsg_put(skb, portid, seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { err = __send_and_alloc_skb(&skb, team, portid, send_func); if (err) return err; goto send_done; } return send_func(skb, team, portid); nla_put_failure: err = -EMSGSIZE; errout: nlmsg_free(skb); return err; } static int team_nl_cmd_port_list_get(struct sk_buff *skb, struct genl_info *info) { struct team *team; int err; team = team_nl_team_get(info); if (!team) return -EINVAL; err = team_nl_send_port_list_get(team, info->snd_portid, info->snd_seq, NLM_F_ACK, team_nl_send_unicast, NULL); team_nl_team_put(team); return err; } static const struct genl_small_ops team_nl_ops[] = { { .cmd = TEAM_CMD_NOOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = team_nl_cmd_noop, }, { .cmd = TEAM_CMD_OPTIONS_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = team_nl_cmd_options_set, .flags = GENL_ADMIN_PERM, }, { .cmd = TEAM_CMD_OPTIONS_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = team_nl_cmd_options_get, .flags = GENL_ADMIN_PERM, }, { .cmd = TEAM_CMD_PORT_LIST_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = team_nl_cmd_port_list_get, .flags = GENL_ADMIN_PERM, }, }; static const struct genl_multicast_group team_nl_mcgrps[] = { { .name = TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME, }, }; static struct genl_family team_nl_family __ro_after_init = { .name = TEAM_GENL_NAME, .version = TEAM_GENL_VERSION, .maxattr = TEAM_ATTR_MAX, .policy = team_nl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = team_nl_ops, .n_small_ops = ARRAY_SIZE(team_nl_ops), .resv_start_op = TEAM_CMD_PORT_LIST_GET + 1, .mcgrps = team_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(team_nl_mcgrps), }; static int team_nl_send_multicast(struct sk_buff *skb, struct team *team, u32 portid) { return genlmsg_multicast_netns(&team_nl_family, dev_net(team->dev), skb, 0, 0, GFP_KERNEL); } static int team_nl_send_event_options_get(struct team *team, struct list_head *sel_opt_inst_list) { return team_nl_send_options_get(team, 0, 0, 0, team_nl_send_multicast, sel_opt_inst_list); } static int team_nl_send_event_port_get(struct team *team, struct team_port *port) { return team_nl_send_port_list_get(team, 0, 0, 0, team_nl_send_multicast, port); } static int __init team_nl_init(void) { return genl_register_family(&team_nl_family); } static void __exit team_nl_fini(void) { genl_unregister_family(&team_nl_family); } /****************** * Change checkers ******************/ static void __team_options_change_check(struct team *team) { int err; struct team_option_inst *opt_inst; LIST_HEAD(sel_opt_inst_list); list_for_each_entry(opt_inst, &team->option_inst_list, list) { if (opt_inst->changed) list_add_tail(&opt_inst->tmp_list, &sel_opt_inst_list); } err = team_nl_send_event_options_get(team, &sel_opt_inst_list); if (err && err != -ESRCH) netdev_warn(team->dev, "Failed to send options change via netlink (err %d)\n", err); } /* rtnl lock is held */ static void __team_port_change_send(struct team_port *port, bool linkup) { int err; port->changed = true; port->state.linkup = linkup; team_refresh_port_linkup(port); if (linkup) { struct ethtool_link_ksettings ecmd; err = __ethtool_get_link_ksettings(port->dev, &ecmd); if (!err) { port->state.speed = ecmd.base.speed; port->state.duplex = ecmd.base.duplex; goto send_event; } } port->state.speed = 0; port->state.duplex = 0; send_event: err = team_nl_send_event_port_get(port->team, port); if (err && err != -ESRCH) netdev_warn(port->team->dev, "Failed to send port change of device %s via netlink (err %d)\n", port->dev->name, err); } static void __team_carrier_check(struct team *team) { struct team_port *port; bool team_linkup; if (team->user_carrier_enabled) return; team_linkup = false; list_for_each_entry(port, &team->port_list, list) { if (port->linkup) { team_linkup = true; break; } } if (team_linkup) netif_carrier_on(team->dev); else netif_carrier_off(team->dev); } static void __team_port_change_check(struct team_port *port, bool linkup) { if (port->state.linkup != linkup) __team_port_change_send(port, linkup); __team_carrier_check(port->team); } static void __team_port_change_port_added(struct team_port *port, bool linkup) { __team_port_change_send(port, linkup); __team_carrier_check(port->team); } static void __team_port_change_port_removed(struct team_port *port) { port->removed = true; __team_port_change_send(port, false); __team_carrier_check(port->team); } static void team_port_change_check(struct team_port *port, bool linkup) { struct team *team = port->team; mutex_lock(&team->lock); __team_port_change_check(port, linkup); mutex_unlock(&team->lock); } /************************************ * Net device notifier event handler ************************************/ static int team_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct team_port *port; port = team_port_get_rtnl(dev); if (!port) return NOTIFY_DONE; switch (event) { case NETDEV_UP: if (netif_oper_up(dev)) team_port_change_check(port, true); break; case NETDEV_DOWN: team_port_change_check(port, false); break; case NETDEV_CHANGE: if (netif_running(port->dev)) team_port_change_check(port, !!netif_oper_up(port->dev)); break; case NETDEV_UNREGISTER: team_del_slave(port->team->dev, dev); break; case NETDEV_FEAT_CHANGE: if (!port->team->notifier_ctx) { port->team->notifier_ctx = true; team_compute_features(port->team); port->team->notifier_ctx = false; } break; case NETDEV_PRECHANGEMTU: /* Forbid to change mtu of underlaying device */ if (!port->team->port_mtu_change_allowed) return NOTIFY_BAD; break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid to change type of underlaying device */ return NOTIFY_BAD; case NETDEV_RESEND_IGMP: /* Propagate to master device */ call_netdevice_notifiers(event, port->team->dev); break; } return NOTIFY_DONE; } static struct notifier_block team_notifier_block __read_mostly = { .notifier_call = team_device_event, }; /*********************** * Module init and exit ***********************/ static int __init team_module_init(void) { int err; register_netdevice_notifier(&team_notifier_block); err = rtnl_link_register(&team_link_ops); if (err) goto err_rtnl_reg; err = team_nl_init(); if (err) goto err_nl_init; return 0; err_nl_init: rtnl_link_unregister(&team_link_ops); err_rtnl_reg: unregister_netdevice_notifier(&team_notifier_block); return err; } static void __exit team_module_exit(void) { team_nl_fini(); rtnl_link_unregister(&team_link_ops); unregister_netdevice_notifier(&team_notifier_block); } module_init(team_module_init); module_exit(team_module_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Jiri Pirko <jpirko@redhat.com>"); MODULE_DESCRIPTION("Ethernet team device driver"); MODULE_ALIAS_RTNL_LINK(DRV_NAME);
1241 1216 2615 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 /* * Compatibility functions which bloat the callers too much to make inline. * All of the callers of these functions should be converted to use folios * eventually. */ #include <linux/migrate.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include "internal.h" struct address_space *page_mapping(struct page *page) { return folio_mapping(page_folio(page)); } EXPORT_SYMBOL(page_mapping); void unlock_page(struct page *page) { return folio_unlock(page_folio(page)); } EXPORT_SYMBOL(unlock_page); void end_page_writeback(struct page *page) { return folio_end_writeback(page_folio(page)); } EXPORT_SYMBOL(end_page_writeback); void wait_on_page_writeback(struct page *page) { return folio_wait_writeback(page_folio(page)); } EXPORT_SYMBOL_GPL(wait_on_page_writeback); void wait_for_stable_page(struct page *page) { return folio_wait_stable(page_folio(page)); } EXPORT_SYMBOL_GPL(wait_for_stable_page); void mark_page_accessed(struct page *page) { folio_mark_accessed(page_folio(page)); } EXPORT_SYMBOL(mark_page_accessed); bool set_page_writeback(struct page *page) { return folio_start_writeback(page_folio(page)); } EXPORT_SYMBOL(set_page_writeback); bool set_page_dirty(struct page *page) { return folio_mark_dirty(page_folio(page)); } EXPORT_SYMBOL(set_page_dirty); int __set_page_dirty_nobuffers(struct page *page) { return filemap_dirty_folio(page_mapping(page), page_folio(page)); } EXPORT_SYMBOL(__set_page_dirty_nobuffers); bool clear_page_dirty_for_io(struct page *page) { return folio_clear_dirty_for_io(page_folio(page)); } EXPORT_SYMBOL(clear_page_dirty_for_io); bool redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) { return folio_redirty_for_writepage(wbc, page_folio(page)); } EXPORT_SYMBOL(redirty_page_for_writepage); void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) { folio_add_lru_vma(page_folio(page), vma); } int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp) { return filemap_add_folio(mapping, page_folio(page), index, gfp); } EXPORT_SYMBOL(add_to_page_cache_lru); noinline struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp) { struct folio *folio; folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); if (IS_ERR(folio)) return NULL; return folio_file_page(folio, index); } EXPORT_SYMBOL(pagecache_get_page); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(grab_cache_page_write_begin); bool isolate_lru_page(struct page *page) { if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) return false; return folio_isolate_lru((struct folio *)page); } void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); } #ifdef CONFIG_MMU void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { VM_BUG_ON_PAGE(PageTail(page), page); return folio_add_new_anon_rmap((struct folio *)page, vma, address); } #endif
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 // SPDX-License-Identifier: GPL-2.0-only /* * llc_output.c - LLC minimal output path * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/llc.h> #include <net/llc_pdu.h> /** * llc_mac_hdr_init - fills MAC header fields * @skb: Address of the frame to initialize its MAC header * @sa: The MAC source address * @da: The MAC destination address * * Fills MAC header fields, depending on MAC type. Returns 0, If MAC type * is a valid type and initialization completes correctly 1, otherwise. */ int llc_mac_hdr_init(struct sk_buff *skb, const unsigned char *sa, const unsigned char *da) { int rc = -EINVAL; switch (skb->dev->type) { case ARPHRD_ETHER: case ARPHRD_LOOPBACK: rc = dev_hard_header(skb, skb->dev, ETH_P_802_2, da, sa, skb->len); if (rc > 0) rc = 0; break; default: break; } return rc; } /** * llc_build_and_send_ui_pkt - unitdata request interface for upper layers * @sap: sap to use * @skb: packet to send * @dmac: destination mac address * @dsap: destination sap * * Upper layers calls this function when upper layer wants to send data * using connection-less mode communication (UI pdu). * * Accept data frame from network layer to be sent using connection- * less mode communication; timeout/retries handled by network layer; * package primitive as an event and send to SAP event handler */ int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, const unsigned char *dmac, unsigned char dsap) { int rc; llc_pdu_header_init(skb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap, LLC_PDU_CMD); llc_pdu_init_as_ui_cmd(skb); rc = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac); if (likely(!rc)) rc = dev_queue_xmit(skb); else kfree_skb(skb); return rc; } EXPORT_SYMBOL(llc_mac_hdr_init); EXPORT_SYMBOL(llc_build_and_send_ui_pkt);
1284 1263 1263 1291 1291 1291 27 1264 1264 1264 1264 69 68 68 1216 1216 1 1 1072 1216 1216 1216 1216 1199 1199 1216 1216 1216 1216 1216 1216 1072 1215 1216 1216 1216 1216 1097 1097 1097 1097 1097 1097 1097 1097 70 1097 1097 1097 1216 1216 1216 1216 1216 1216 1216 1216 1236 1236 26 1236 1229 1236 1212 1236 26 26 1216 1216 1216 1216 1216 1216 1216 1216 1216 1216 1230 1230 1236 1230 1236 1236 1236 1236 1236 1236 1236 1229 1230 1210 26 1230 1230 1243 1243 1222 1243 1243 1125 1243 4 1236 1324 1324 1324 1324 1324 1324 1324 1324 1290 1291 1291 1291 68 1304 364 1304 1304 364 1324 1251 1324 1324 1324 1324 40 1324 1324 1324 1284 1324 1324 1324 1324 1283 1324 1319 1251 377 1236 26 1210 40 40 1284 1284 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/secretmem.h> #include <linux/sched/signal.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> #include <linux/migrate.h> #include <linux/mm_inline.h> #include <linux/sched/mm.h> #include <linux/shmem_fs.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include "internal.h" struct follow_page_context { struct dev_pagemap *pgmap; unsigned int page_mask; }; static inline void sanity_check_pinned_pages(struct page **pages, unsigned long npages) { if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; /* * We only pin anonymous pages if they are exclusive. Once pinned, we * can no longer turn them possibly shared and PageAnonExclusive() will * stick around until the page is freed. * * We'd like to verify that our pinned anonymous pages are still mapped * exclusively. The issue with anon THP is that we don't know how * they are/were mapped when pinning them. However, for anon * THP we can assume that either the given page (PTE-mapped THP) or * the head page (PMD-mapped THP) should be PageAnonExclusive(). If * neither is the case, there is certainly something wrong. */ for (; npages; npages--, pages++) { struct page *page = *pages; struct folio *folio = page_folio(page); if (is_zero_page(page) || !folio_test_anon(folio)) continue; if (!folio_test_large(folio) || folio_test_hugetlb(folio)) VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page); else /* Either a PTE-mapped or a PMD-mapped THP. */ VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) && !PageAnonExclusive(page), page); } } /* * Return the folio with ref appropriately incremented, * or NULL if that failed. */ static inline struct folio *try_get_folio(struct page *page, int refs) { struct folio *folio; retry: folio = page_folio(page); if (WARN_ON_ONCE(folio_ref_count(folio) < 0)) return NULL; if (unlikely(!folio_ref_try_add_rcu(folio, refs))) return NULL; /* * At this point we have a stable reference to the folio; but it * could be that between calling page_folio() and the refcount * increment, the folio was split, in which case we'd end up * holding a reference on a folio that has nothing to do with the page * we were given anymore. * So now that the folio is stable, recheck that the page still * belongs to this folio. */ if (unlikely(page_folio(page) != folio)) { if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs); goto retry; } return folio; } /** * try_grab_folio() - Attempt to get or pin a folio. * @page: pointer to page to be grabbed * @refs: the value to (effectively) add to the folio's refcount * @flags: gup flags: these are the FOLL_* flag values. * * "grab" names in this file mean, "look at flags to decide whether to use * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. * * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the * same time. (That's true throughout the get_user_pages*() and * pin_user_pages*() APIs.) Cases: * * FOLL_GET: folio's refcount will be incremented by @refs. * * FOLL_PIN on large folios: folio's refcount will be incremented by * @refs, and its pincount will be incremented by @refs. * * FOLL_PIN on single-page folios: folio's refcount will be incremented by * @refs * GUP_PIN_COUNTING_BIAS. * * Return: The folio containing @page (with refcount appropriately * incremented) for success, or NULL upon failure. If neither FOLL_GET * nor FOLL_PIN was set, that's considered failure, and furthermore, * a likely bug in the caller, so a warning is also emitted. */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) { struct folio *folio; if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) return NULL; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return NULL; if (flags & FOLL_GET) return try_get_folio(page, refs); /* FOLL_PIN is set */ /* * Don't take a pin on the zero page - it's not going anywhere * and it is used in a *lot* of places. */ if (is_zero_page(page)) return page_folio(page); folio = try_get_folio(page, refs); if (!folio) return NULL; /* * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a * right zone, so fail and let the caller fall back to the slow * path. */ if (unlikely((flags & FOLL_LONGTERM) && !folio_is_longterm_pinnable(folio))) { if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs); return NULL; } /* * When pinning a large folio, use an exact count to track it. * * However, be sure to *also* increment the normal folio * refcount field at least once, so that the folio really * is pinned. That's why the refcount from the earlier * try_get_folio() is left intact. */ if (folio_test_large(folio)) atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1)); /* * Adjust the pincount before re-checking the PTE for changes. * This is essentially a smp_mb() and is paired with a memory * barrier in page_try_share_anon_rmap(). */ smp_mb__after_atomic(); node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); return folio; } static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) { if (flags & FOLL_PIN) { if (is_zero_folio(folio)) return; node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); if (folio_test_large(folio)) atomic_sub(refs, &folio->_pincount); else refs *= GUP_PIN_COUNTING_BIAS; } if (!put_devmap_managed_page_refs(&folio->page, refs)) folio_put_refs(folio, refs); } /** * try_grab_page() - elevate a page's refcount by a flag-dependent amount * @page: pointer to page to be grabbed * @flags: gup flags: these are the FOLL_* flag values. * * This might not do anything at all, depending on the flags argument. * * "grab" names in this file mean, "look at flags to decide whether to use * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. * * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same * time. Cases: please see the try_grab_folio() documentation, with * "refs=1". * * Return: 0 for success, or if no action was required (if neither FOLL_PIN * nor FOLL_GET was set, nothing is done). A negative error code for failure: * * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not * be grabbed. */ int __must_check try_grab_page(struct page *page, unsigned int flags) { struct folio *folio = page_folio(page); if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return -EREMOTEIO; if (flags & FOLL_GET) folio_ref_inc(folio); else if (flags & FOLL_PIN) { /* * Don't take a pin on the zero page - it's not going anywhere * and it is used in a *lot* of places. */ if (is_zero_page(page)) return 0; /* * Similar to try_grab_folio(): be sure to *also* * increment the normal page refcount field at least once, * so that the page really is pinned. */ if (folio_test_large(folio)) { folio_ref_add(folio, 1); atomic_add(1, &folio->_pincount); } else { folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); } node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); } return 0; } /** * unpin_user_page() - release a dma-pinned page * @page: pointer to page to be released * * Pages that were pinned via pin_user_pages*() must be released via either * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so * that such pages can be separately tracked and uniquely handled. In * particular, interactions with RDMA and filesystems need special handling. */ void unpin_user_page(struct page *page) { sanity_check_pinned_pages(&page, 1); gup_put_folio(page_folio(page), 1, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_page); /** * folio_add_pin - Try to get an additional pin on a pinned folio * @folio: The folio to be pinned * * Get an additional pin on a folio we already have a pin on. Makes no change * if the folio is a zero_page. */ void folio_add_pin(struct folio *folio) { if (is_zero_folio(folio)) return; /* * Similar to try_grab_folio(): be sure to *also* increment the normal * page refcount field at least once, so that the page really is * pinned. */ if (folio_test_large(folio)) { WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1); folio_ref_inc(folio); atomic_inc(&folio->_pincount); } else { WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS); folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); } } static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { struct page *next = nth_page(start, i); struct folio *folio = page_folio(next); unsigned int nr = 1; if (folio_test_large(folio)) nr = min_t(unsigned int, npages - i, folio_nr_pages(folio) - folio_page_idx(folio, next)); *ntails = nr; return folio; } static inline struct folio *gup_folio_next(struct page **list, unsigned long npages, unsigned long i, unsigned int *ntails) { struct folio *folio = page_folio(list[i]); unsigned int nr; for (nr = i + 1; nr < npages; nr++) { if (page_folio(list[nr]) != folio) break; } *ntails = nr - i; return folio; } /** * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages * @pages: array of pages to be maybe marked dirty, and definitely released. * @npages: number of pages in the @pages array. * @make_dirty: whether to mark the pages dirty * * "gup-pinned page" refers to a page that has had one of the get_user_pages() * variants called on that page. * * For each page in the @pages array, make that page (or its head page, if a * compound page) dirty, if @make_dirty is true, and if the page was previously * listed as clean. In any case, releases all pages using unpin_user_page(), * possibly via unpin_user_pages(), for the non-dirty case. * * Please see the unpin_user_page() documentation for details. * * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is * required, then the caller should a) verify that this is really correct, * because _lock() is usually required, and b) hand code it: * set_page_dirty_lock(), unpin_user_page(). * */ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty) { unsigned long i; struct folio *folio; unsigned int nr; if (!make_dirty) { unpin_user_pages(pages, npages); return; } sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); /* * Checking PageDirty at this point may race with * clear_page_dirty_for_io(), but that's OK. Two key * cases: * * 1) This code sees the page as already dirty, so it * skips the call to set_page_dirty(). That could happen * because clear_page_dirty_for_io() called * page_mkclean(), followed by set_page_dirty(). * However, now the page is going to get written back, * which meets the original intention of setting it * dirty, so all is well: clear_page_dirty_for_io() goes * on to call TestClearPageDirty(), and write the page * back. * * 2) This code sees the page as clean, so it calls * set_page_dirty(). The page stays dirty, despite being * written back, so it gets written back again in the * next writeback cycle. This is harmless. */ if (!folio_test_dirty(folio)) { folio_lock(folio); folio_mark_dirty(folio); folio_unlock(folio); } gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_pages_dirty_lock); /** * unpin_user_page_range_dirty_lock() - release and optionally dirty * gup-pinned page range * * @page: the starting page of a range maybe marked dirty, and definitely released. * @npages: number of consecutive pages to release. * @make_dirty: whether to mark the pages dirty * * "gup-pinned page range" refers to a range of pages that has had one of the * pin_user_pages() variants called on that page. * * For the page ranges defined by [page .. page+npages], make that range (or * its head pages, if a compound page) dirty, if @make_dirty is true, and if the * page range was previously listed as clean. * * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is * required, then the caller should a) verify that this is really correct, * because _lock() is usually required, and b) hand code it: * set_page_dirty_lock(), unpin_user_page(). * */ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty) { unsigned long i; struct folio *folio; unsigned int nr; for (i = 0; i < npages; i += nr) { folio = gup_folio_range_next(page, npages, i, &nr); if (make_dirty && !folio_test_dirty(folio)) { folio_lock(folio); folio_mark_dirty(folio); folio_unlock(folio); } gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); static void unpin_user_pages_lockless(struct page **pages, unsigned long npages) { unsigned long i; struct folio *folio; unsigned int nr; /* * Don't perform any sanity checks because we might have raced with * fork() and some anonymous pages might now actually be shared -- * which is why we're unpinning after all. */ for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } } /** * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. * * For each page in the @pages array, release the page using unpin_user_page(). * * Please see the unpin_user_page() documentation for details. */ void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long i; struct folio *folio; unsigned int nr; /* * If this WARN_ON() fires, then the system *might* be leaking pages (by * leaving them pinned), but probably not. More likely, gup/pup returned * a hard -ERRNO error to the caller, who erroneously passed it here. */ if (WARN_ON(IS_ERR_VALUE(npages))) return; sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_pages); /* * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's * lifecycle. Avoid setting the bit unless necessary, or it might cause write * cache bouncing on large SMP machines for concurrent pinned gups. */ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) { if (!test_bit(MMF_HAS_PINNED, mm_flags)) set_bit(MMF_HAS_PINNED, mm_flags); } #ifdef CONFIG_MMU static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { /* * When core dumping an enormous anonymous area that nobody * has touched so far, we don't want to allocate unnecessary pages or * page tables. Return error instead of NULL to skip handle_mm_fault, * then get_dump_page() will return NULL to leave a hole in the dump. * But we can only make this optimization where a hole would surely * be zero-filled if handle_mm_fault() actually did handle it. */ if ((flags & FOLL_DUMP) && (vma_is_anonymous(vma) || !vma->vm_ops->fault)) return ERR_PTR(-EFAULT); return NULL; } static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { if (flags & FOLL_TOUCH) { pte_t orig_entry = ptep_get(pte); pte_t entry = orig_entry; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); if (!pte_same(orig_entry, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); } } /* Proper page table entry exists, but no corresponding struct page */ return -EEXIST; } /* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */ static inline bool can_follow_write_pte(pte_t pte, struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* If the pte is writable, we can write to the page. */ if (pte_write(pte)) return true; /* Maybe FOLL_FORCE is set to override it? */ if (!(flags & FOLL_FORCE)) return false; /* But FOLL_FORCE has no effect on shared mappings */ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) return false; /* ... or read-only private ones */ if (!(vma->vm_flags & VM_MAYWRITE)) return false; /* ... or already writable ones that just need to take a write fault */ if (vma->vm_flags & VM_WRITE) return false; /* * See can_change_pte_writable(): we broke COW and could map the page * writable if we have an exclusive anonymous page ... */ if (!page || !PageAnon(page) || !PageAnonExclusive(page)) return false; /* ... and a write-fault isn't required for other reasons. */ if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) return false; return !userfaultfd_pte_wp(vma, pte); } static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, struct dev_pagemap **pgmap) { struct mm_struct *mm = vma->vm_mm; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; int ret; /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags); pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags)) goto no_page; page = vm_normal_page(vma, address, pte); /* * We only care about anon pages in can_follow_write_pte() and don't * have to worry about pte_devmap() because they are never anon. */ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, vma, flags)) { page = NULL; goto out; } if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN * case since they are only valid while holding the pgmap * reference. */ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); if (*pgmap) page = pte_page(pte); else goto no_page; } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); goto out; } if (is_zero_pfn(pte_pfn(pte))) { page = pte_page(pte); } else { ret = follow_pfn_pte(vma, address, ptep, flags); page = ERR_PTR(ret); goto out; } } if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ ret = try_grab_page(page, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; } /* * We need to make the page accessible if and only if we are going * to access its content (the FOLL_PIN case). Please see * Documentation/core-api/pin_user_pages.rst for details. */ if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { unpin_user_page(page); page = ERR_PTR(ret); goto out; } } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * mark_page_accessed(). */ mark_page_accessed(page); } out: pte_unmap_unlock(ptep, ptl); return page; no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return NULL; return no_page_table(vma, flags); } static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, struct follow_page_context *ctx) { pmd_t *pmd, pmdval; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pmd = pmd_offset(pudp, address); pmdval = pmdp_get_lockless(pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags); if (!pmd_present(pmdval)) return no_page_table(vma, flags); if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; } if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) return no_page_table(vma, flags); ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); return no_page_table(vma, flags); } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } if (flags & FOLL_SPLIT_PMD) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); ctx->page_mask = HPAGE_PMD_NR - 1; return page; } static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, unsigned int flags, struct follow_page_context *ctx) { pud_t *pud; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pud = pud_offset(p4dp, address); if (pud_none(*pud)) return no_page_table(vma, flags); if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; } if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); return follow_pmd_mask(vma, address, pud, flags, ctx); } static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, unsigned int flags, struct follow_page_context *ctx) { p4d_t *p4d; p4d = p4d_offset(pgdp, address); if (p4d_none(*p4d)) return no_page_table(vma, flags); BUILD_BUG_ON(p4d_huge(*p4d)); if (unlikely(p4d_bad(*p4d))) return no_page_table(vma, flags); return follow_pud_mask(vma, address, p4d, flags, ctx); } /** * follow_page_mask - look up a page descriptor from a user-virtual address * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a * pointer to output page_mask * * @flags can have FOLL_ flags set, defined in <linux/mm.h> * * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches * the device's dev_pagemap metadata to avoid repeating expensive lookups. * * When getting an anonymous page and the caller has to trigger unsharing * of a shared anonymous page first, -EMLINK is returned. The caller should * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only * relevant with FOLL_PIN and !FOLL_WRITE. * * On output, the @ctx->page_mask is set according to the size of the page. * * Return: the mapped (struct page *), %NULL if no mapping exists, or * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). */ static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct follow_page_context *ctx) { pgd_t *pgd; struct mm_struct *mm = vma->vm_mm; ctx->page_mask = 0; /* * Call hugetlb_follow_page_mask for hugetlb vmas as it will use * special hugetlb page table walking code. This eliminates the * need to check for hugetlb entries in the general walking code. */ if (is_vm_hugetlb_page(vma)) return hugetlb_follow_page_mask(vma, address, flags, &ctx->page_mask); pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); return follow_p4d_mask(vma, address, pgd, flags, ctx); } struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { struct follow_page_context ctx = { NULL }; struct page *page; if (vma_is_secretmem(vma)) return NULL; if (WARN_ON_ONCE(foll_flags & FOLL_PIN)) return NULL; /* * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect * to fail on PROT_NONE-mapped pages. */ page = follow_page_mask(vma, address, foll_flags, &ctx); if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return page; } static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; pte_t entry; int ret = -EFAULT; /* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) return -EFAULT; if (address > TASK_SIZE) pgd = pgd_offset_k(address); else pgd = pgd_offset_gate(mm, address); if (pgd_none(*pgd)) return -EFAULT; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return -EFAULT; pud = pud_offset(p4d, address); if (pud_none(*pud)) return -EFAULT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; pte = pte_offset_map(pmd, address); if (!pte) return -EFAULT; entry = ptep_get(pte); if (pte_none(entry)) goto unmap; *vma = get_gate_vma(mm); if (!page) goto out; *page = vm_normal_page(*vma, address, entry); if (!*page) { if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry))) goto unmap; *page = pte_page(entry); } ret = try_grab_page(*page, gup_flags); if (unlikely(ret)) goto unmap; out: ret = 0; unmap: pte_unmap(pte); return ret; } /* * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set * to 0 and -EBUSY returned. */ static int faultin_page(struct vm_area_struct *vma, unsigned long address, unsigned int *flags, bool unshare, int *locked) { unsigned int fault_flags = 0; vm_fault_t ret; if (*flags & FOLL_NOFAULT) return -EFAULT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; if (*flags & FOLL_UNLOCKABLE) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE. * That's because some callers may not be prepared to * handle early exits caused by non-fatal signals. */ if (*flags & FOLL_INTERRUPTIBLE) fault_flags |= FAULT_FLAG_INTERRUPTIBLE; } if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) { /* * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED * can co-exist */ fault_flags |= FAULT_FLAG_TRIED; } if (unshare) { fault_flags |= FAULT_FLAG_UNSHARE; /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */ VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE); } ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * With FAULT_FLAG_RETRY_NOWAIT we'll never release the * mmap lock in the page fault handler. Sanity check this. */ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT); *locked = 0; /* * We should do the same as VM_FAULT_RETRY, but let's not * return -EBUSY since that's not reflecting the reality of * what has happened - we've just fully completed a page * fault, with the mmap lock released. Use -EAGAIN to show * that we want to take the mmap lock _again_. */ return -EAGAIN; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, *flags); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *locked = 0; return -EBUSY; } return 0; } /* * Writing to file-backed mappings which require folio dirty tracking using GUP * is a fundamentally broken operation, as kernel write access to GUP mappings * do not adhere to the semantics expected by a file system. * * Consider the following scenario:- * * 1. A folio is written to via GUP which write-faults the memory, notifying * the file system and dirtying the folio. * 2. Later, writeback is triggered, resulting in the folio being cleaned and * the PTE being marked read-only. * 3. The GUP caller writes to the folio, as it is mapped read/write via the * direct mapping. * 4. The GUP caller, now done with the page, unpins it and sets it dirty * (though it does not have to). * * This results in both data being written to a folio without writenotify, and * the folio being dirtied unexpectedly (if the caller decides to do so). */ static bool writable_file_mapping_allowed(struct vm_area_struct *vma, unsigned long gup_flags) { /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the case we disallow. */ if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) != (FOLL_PIN | FOLL_LONGTERM)) return true; /* * If the VMA does not require dirty tracking then no problematic write * can occur either. */ return !vma_needs_dirty_tracking(vma); } static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) { vm_flags_t vm_flags = vma->vm_flags; int write = (gup_flags & FOLL_WRITE); int foreign = (gup_flags & FOLL_REMOTE); bool vma_anon = vma_is_anonymous(vma); if (vm_flags & (VM_IO | VM_PFNMAP)) return -EFAULT; if ((gup_flags & FOLL_ANON) && !vma_anon) return -EFAULT; if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; if (vma_is_secretmem(vma)) return -EFAULT; if (write) { if (!vma_anon && !writable_file_mapping_allowed(vma, gup_flags)) return -EFAULT; if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ if (is_vm_hugetlb_page(vma)) return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could * set a breakpoint in a read-only mapping of an * executable, without corrupting the file (yet only * when that file had been opened for writing!). * Anon pages in shared mappings are surprising: now * just reject it. */ if (!is_cow_mapping(vm_flags)) return -EFAULT; } } else if (!(vm_flags & VM_READ)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* * Is there actually any vma we can reach here which does not * have VM_MAYREAD set? */ if (!(vm_flags & VM_MAYREAD)) return -EFAULT; } /* * gups are always data accesses, not instruction * fetches, so execute=false here */ if (!arch_vma_access_permitted(vma, write, false, foreign)) return -EFAULT; return 0; } /* * This is "vma_lookup()", but with a warning if we would have * historically expanded the stack in the GUP code. */ static struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm, unsigned long addr) { #ifdef CONFIG_STACK_GROWSUP return vma_lookup(mm, addr); #else static volatile unsigned long next_warn; struct vm_area_struct *vma; unsigned long now, next; vma = find_vma(mm, addr); if (!vma || (addr >= vma->vm_start)) return vma; /* Only warn for half-way relevant accesses */ if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; if (vma->vm_start - addr > 65536) return NULL; /* Let's not warn more than once an hour.. */ now = jiffies; next = next_warn; if (next && time_before(now, next)) return NULL; next_warn = now + 60*60*HZ; /* Let people know things may have changed. */ pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n", current->comm, task_pid_nr(current), vma->vm_start, vma->vm_end, addr); dump_stack(); return NULL; #endif } /** * __get_user_pages() - pin user pages in memory * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @locked: whether we're still with the mmap_lock held * * Returns either number of pages pinned (which may be less than the * number requested), or an error. Details about the return value: * * -- If nr_pages is 0, returns 0. * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. * -- 0 return value is possible when the fault would need to be retried. * * The caller is responsible for releasing returned @pages, via put_page(). * * Must be called with mmap_lock held. It may be released. See below. * * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * __get_user_pages returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re-faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If * the page is written to, set_page_dirty (or set_page_dirty_lock, as * appropriate) must be called after the page is finished with, and * before put_page is called. * * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may * be released. If this happens *@locked will be set to 0 on return. * * A caller using such a combination of @gup_flags must therefore hold the * mmap_lock for reading only, and recognize when it's been released. Otherwise, * it must be held for either reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; struct follow_page_context ctx = { NULL }; if (!nr_pages) return 0; start = untagged_addr_remote(mm, start); VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); do { struct page *page; unsigned int foll_flags = gup_flags; unsigned int page_increm; /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { vma = gup_vma_lookup(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, pages ? &page : NULL); if (ret) goto out; ctx.page_mask = 0; goto next_page; } if (!vma) { ret = -EFAULT; goto out; } ret = check_vma_flags(vma, gup_flags); if (ret) goto out; } retry: /* * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } cond_resched(); page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page || PTR_ERR(page) == -EMLINK) { ret = faultin_page(vma, start, &foll_flags, PTR_ERR(page) == -EMLINK, locked); switch (ret) { case 0: goto retry; case -EBUSY: case -EAGAIN: ret = 0; fallthrough; case -EFAULT: case -ENOMEM: case -EHWPOISON: goto out; } BUG(); } else if (PTR_ERR(page) == -EEXIST) { /* * Proper page table entry exists, but no corresponding * struct page. If the caller expects **pages to be * filled in, bail out now, because that can't be done * for this page. */ if (pages) { ret = PTR_ERR(page); goto out; } } else if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } next_page: page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; if (pages) { struct page *subpage; unsigned int j; /* * This must be a large folio (and doesn't need to * be the whole folio; it can be part of it), do * the refcount work for all the subpages too. * * NOTE: here the page may not be the head page * e.g. when start addr is not thp-size aligned. * try_grab_folio() should have taken care of tail * pages. */ if (page_increm > 1) { struct folio *folio; /* * Since we already hold refcount on the * large folio, this should never fail. */ folio = try_grab_folio(page, page_increm - 1, foll_flags); if (WARN_ON_ONCE(!folio)) { /* * Release the 1st page ref if the * folio is problematic, fail hard. */ gup_put_folio(page_folio(page), 1, foll_flags); ret = -EFAULT; goto out; } } for (j = 0; j < page_increm; j++) { subpage = nth_page(page, j); pages[i + j] = subpage; flush_anon_page(vma, subpage, start + j * PAGE_SIZE); flush_dcache_page(subpage); } } i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); out: if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return i ? i : ret; } static bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) { bool write = !!(fault_flags & FAULT_FLAG_WRITE); bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); vm_flags_t vm_flags = write ? VM_WRITE : VM_READ; if (!(vm_flags & vma->vm_flags)) return false; /* * The architecture might have a hardware protection * mechanism other than read/write that can deny access. * * gup always represents data access, not instruction * fetches, so execute=false here: */ if (!arch_vma_access_permitted(vma, write, false, foreign)) return false; return true; } /** * fixup_user_fault() - manually resolve a user page fault * @mm: mm_struct of target mm * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller * does not allow retry. If NULL, the caller must guarantee * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY. * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() * section), this returns -EFAULT, and we want to resolve the user fault before * trying again. * * Typically this is meant to be used by the futex code. * * The main difference with get_user_pages() is that this function will * unconditionally call handle_mm_fault() which will in turn perform all the * necessary SW fixup of the dirty and young bits in the PTE, while * get_user_pages() only guarantees to update these in the struct page. * * This is important for some architectures where those bits also gate the * access permission to the page because they are maintained in software. On * such architectures, gup() will not be enough to make a subsequent access * succeed. * * This function will not return with an unlocked mmap_lock. So it has not the * same semantics wrt the @mm->mmap_lock as does filemap_fault(). */ int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { struct vm_area_struct *vma; vm_fault_t ret; address = untagged_addr_remote(mm, address); if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; retry: vma = gup_vma_lookup(mm, address); if (!vma) return -EFAULT; if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; if ((fault_flags & FAULT_FLAG_KILLABLE) && fatal_signal_pending(current)) return -EINTR; ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * NOTE: it's a pity that we need to retake the lock here * to pair with the unlock() in the callers. Ideally we * could tell the callers so they do not need to unlock. */ mmap_read_lock(mm); *unlocked = true; return 0; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); *unlocked = true; fault_flags |= FAULT_FLAG_TRIED; goto retry; } return 0; } EXPORT_SYMBOL_GPL(fixup_user_fault); /* * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is * specified, it'll also respond to generic signals. The caller of GUP * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption. */ static bool gup_signal_pending(unsigned int flags) { if (fatal_signal_pending(current)) return true; if (!(flags & FOLL_INTERRUPTIBLE)) return false; return signal_pending(current); } /* * Locking: (*locked == 1) means that the mmap_lock has already been acquired by * the caller. This function may drop the mmap_lock. If it does so, then it will * set (*locked = 0). * * (*locked == 0) means that the caller expects this function to acquire and * drop the mmap_lock. Therefore, the value of *locked will still be zero when * the function returns, even though it may have changed temporarily during * function execution. * * Please note that this function, unlike __get_user_pages(), will not return 0 * for nr_pages > 0, unless FOLL_NOWAIT is used. */ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int flags) { long ret, pages_done; bool must_unlock = false; if (!nr_pages) return 0; /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. */ if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; *locked = 1; } else mmap_assert_locked(mm); if (flags & FOLL_PIN) mm_set_has_pinned_flag(&mm->flags); /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior * is to set FOLL_GET if the caller wants pages[] filled in (but has * carelessly failed to specify FOLL_GET), so keep doing that, but only * for FOLL_GET, not for the newer FOLL_PIN. * * FOLL_PIN always expects pages to be non-null, but no need to assert * that here, as any failures will be obvious enough. */ if (pages && !(flags & FOLL_PIN)) flags |= FOLL_GET; pages_done = 0; for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, locked); if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ pages_done = ret; break; } /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */ if (!*locked) { BUG_ON(ret < 0); BUG_ON(ret >= nr_pages); } if (ret > 0) { nr_pages -= ret; pages_done += ret; if (!nr_pages) break; } if (*locked) { /* * VM_FAULT_RETRY didn't trigger or it was a * FOLL_NOWAIT. */ if (!pages_done) pages_done = ret; break; } /* * VM_FAULT_RETRY triggered, so seek to the faulting offset. * For the prefault case (!pages) we only update counts. */ if (likely(pages)) pages += ret; start += ret << PAGE_SHIFT; /* The lock was temporarily dropped, so we must unlock later */ must_unlock = true; retry: /* * Repeat on the address that fired VM_FAULT_RETRY * with both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_TRIED. Note that GUP can be interrupted * by fatal signals of even common signals, depending on * the caller's request. So we need to check it before we * start trying again otherwise it can loop forever. */ if (gup_signal_pending(flags)) { if (!pages_done) pages_done = -EINTR; break; } ret = mmap_read_lock_killable(mm); if (ret) { BUG_ON(ret > 0); if (!pages_done) pages_done = ret; break; } *locked = 1; ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED, pages, locked); if (!*locked) { /* Continue to retry until we succeeded */ BUG_ON(ret != 0); goto retry; } if (ret != 1) { BUG_ON(ret > 1); if (!pages_done) pages_done = ret; break; } nr_pages--; pages_done++; if (!nr_pages) break; if (likely(pages)) pages++; start += PAGE_SIZE; } if (must_unlock && *locked) { /* * We either temporarily dropped the lock, or the caller * requested that we both acquire and drop the lock. Either way, * we must now unlock, and notify the caller of that state. */ mmap_read_unlock(mm); *locked = 0; } /* * Failing to pin anything implies something has gone wrong (except when * FOLL_NOWAIT is specified). */ if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT))) return -EFAULT; return pages_done; } /** * populate_vma_page_range() - populate a range of pages in the vma. * @vma: target vma * @start: start address * @end: end address * @locked: whether the mmap_lock is still held * * This takes care of mlocking the pages too if VM_LOCKED is set. * * Return either number of pages pinned in the vma, or a negative error * code on error. * * vma->vm_mm->mmap_lock must be held. * * If @locked is NULL, it may be held for read or write and will * be unperturbed. * * If @locked is non-NULL, it must held for read only and may be * released. If it's released, *@locked will be set to 0. */ long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked) { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int local_locked = 1; int gup_flags; long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* * Rightly or wrongly, the VM_LOCKONFAULT case has never used * faultin_page() to break COW, so it has no work to do here. */ if (vma->vm_flags & VM_LOCKONFAULT) return nr_pages; gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW * and we would not want to dirty them for nothing. */ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; /* * We want mlock to succeed for regions that have any permissions * other than PROT_NONE. */ if (vma_is_accessible(vma)) gup_flags |= FOLL_FORCE; if (locked) gup_flags |= FOLL_UNLOCKABLE; /* * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ ret = __get_user_pages(mm, start, nr_pages, gup_flags, NULL, locked ? locked : &local_locked); lru_add_drain(); return ret; } /* * faultin_vma_page_range() - populate (prefault) page tables inside the * given VMA range readable/writable * * This takes care of mlocking the pages, too, if VM_LOCKED is set. * * @vma: target vma * @start: start address * @end: end address * @write: whether to prefault readable or writable * @locked: whether the mmap_lock is still held * * Returns either number of processed pages in the vma, or a negative error * code on error (see __get_user_pages()). * * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and * covered by the VMA. If it's released, *@locked will be set to 0. */ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked) { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* * FOLL_TOUCH: Mark page accessed and thereby young; will also mark * the page dirty with FOLL_WRITE -- which doesn't make a * difference with !FOLL_FORCE, because the page is writable * in the page table. * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit * a poisoned page. * !FOLL_FORCE: Require proper access permissions. */ gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE; if (write) gup_flags |= FOLL_WRITE; /* * We want to report -EINVAL instead of -EFAULT for any permission * problems or incompatible mappings. */ if (check_vma_flags(vma, gup_flags)) return -EINVAL; ret = __get_user_pages(mm, start, nr_pages, gup_flags, NULL, locked); lru_add_drain(); return ret; } /* * __mm_populate - populate and/or mlock pages within a range of address space. * * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap * flags. VMAs must be already marked with the desired vm_flags, and * mmap_lock must not be held. */ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; long ret = 0; end = start + len; for (nstart = start; nstart < end; nstart = nend) { /* * We want to fault in pages for [nstart; end) address range. * Find first corresponding VMA. */ if (!locked) { locked = 1; mmap_read_lock(mm); vma = find_vma_intersection(mm, nstart, end); } else if (nstart >= vma->vm_end) vma = find_vma_intersection(mm, vma->vm_end, end); if (!vma) break; /* * Set [nstart; nend) to intersection of desired address * range with the first VMA. Also, skip undesirable VMA types. */ nend = min(end, vma->vm_end); if (vma->vm_flags & (VM_IO | VM_PFNMAP)) continue; if (nstart < vma->vm_start) nstart = vma->vm_start; /* * Now fault in a range of pages. populate_vma_page_range() * double checks the vma flags, so that it won't mlock pages * if the vma was already munlocked. */ ret = populate_vma_page_range(vma, nstart, nend, &locked); if (ret < 0) { if (ignore_errors) { ret = 0; continue; /* continue at next VMA */ } break; } nend = nstart + ret * PAGE_SIZE; ret = 0; } if (locked) mmap_read_unlock(mm); return ret; /* 0 or negative error code */ } #else /* CONFIG_MMU */ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int foll_flags) { struct vm_area_struct *vma; bool must_unlock = false; unsigned long vm_flags; long i; if (!nr_pages) return 0; /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. */ if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; *locked = 1; } /* calculate required read or write permissions. * If FOLL_FORCE is set, we only require the "MAY" flags. */ vm_flags = (foll_flags & FOLL_WRITE) ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (foll_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); if (!vma) break; /* protect what we can, including chardevs */ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) break; if (pages) { pages[i] = virt_to_page((void *)start); if (pages[i]) get_page(pages[i]); } start = (start + PAGE_SIZE) & PAGE_MASK; } if (must_unlock && *locked) { mmap_read_unlock(mm); *locked = 0; } return i ? : -EFAULT; } #endif /* !CONFIG_MMU */ /** * fault_in_writeable - fault in userspace address range for writing * @uaddr: start of address range * @size: size of address range * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_writeable(char __user *uaddr, size_t size) { char __user *start = uaddr, *end; if (unlikely(size == 0)) return 0; if (!user_write_access_begin(uaddr, size)) return size; if (!PAGE_ALIGNED(uaddr)) { unsafe_put_user(0, uaddr, out); uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { unsafe_put_user(0, uaddr, out); uaddr += PAGE_SIZE; } out: user_write_access_end(); if (size > uaddr - start) return size - (uaddr - start); return 0; } EXPORT_SYMBOL(fault_in_writeable); /** * fault_in_subpage_writeable - fault in an address range for writing * @uaddr: start of address range * @size: size of address range * * Fault in a user address range for writing while checking for permissions at * sub-page granularity (e.g. arm64 MTE). This function should be used when * the caller cannot guarantee forward progress of a copy_to_user() loop. * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_subpage_writeable(char __user *uaddr, size_t size) { size_t faulted_in; /* * Attempt faulting in at page granularity first for page table * permission checking. The arch-specific probe_subpage_writeable() * functions may not check for this. */ faulted_in = size - fault_in_writeable(uaddr, size); if (faulted_in) faulted_in -= probe_subpage_writeable(uaddr, faulted_in); return size - faulted_in; } EXPORT_SYMBOL(fault_in_subpage_writeable); /* * fault_in_safe_writeable - fault in an address range for writing * @uaddr: start of address range * @size: length of address range * * Faults in an address range for writing. This is primarily useful when we * already know that some or all of the pages in the address range aren't in * memory. * * Unlike fault_in_writeable(), this function is non-destructive. * * Note that we don't pin or otherwise hold the pages referenced that we fault * in. There's no guarantee that they'll stay in memory for any duration of * time. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { unsigned long start = (unsigned long)uaddr, end; struct mm_struct *mm = current->mm; bool unlocked = false; if (unlikely(size == 0)) return 0; end = PAGE_ALIGN(start + size); if (end < start) end = 0; mmap_read_lock(mm); do { if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked)) break; start = (start + PAGE_SIZE) & PAGE_MASK; } while (start != end); mmap_read_unlock(mm); if (size > (unsigned long)uaddr - start) return size - ((unsigned long)uaddr - start); return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); /** * fault_in_readable - fault in userspace address range for reading * @uaddr: start of user address range * @size: size of user address range * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_readable(const char __user *uaddr, size_t size) { const char __user *start = uaddr, *end; volatile char c; if (unlikely(size == 0)) return 0; if (!user_read_access_begin(uaddr, size)) return size; if (!PAGE_ALIGNED(uaddr)) { unsafe_get_user(c, uaddr, out); uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { unsafe_get_user(c, uaddr, out); uaddr += PAGE_SIZE; } out: user_read_access_end(); (void)c; if (size > uaddr - start) return size - (uaddr - start); return 0; } EXPORT_SYMBOL(fault_in_readable); /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address * * Returns struct page pointer of user page pinned for dump, * to be freed afterwards by put_page(). * * Returns NULL on any kind of failure - a hole must then be inserted into * the corefile, to preserve alignment with its headers; and also returns * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - * allowing a hole to be left in the corefile to save disk space. * * Called without mmap_lock (takes and releases the mmap_lock by itself). */ #ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr) { struct page *page; int locked = 0; int ret; ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ #ifdef CONFIG_MIGRATION /* * Returns the number of collected pages. Return value is always >= 0. */ static unsigned long collect_longterm_unpinnable_pages( struct list_head *movable_page_list, unsigned long nr_pages, struct page **pages) { unsigned long i, collected = 0; struct folio *prev_folio = NULL; bool drain_allow = true; for (i = 0; i < nr_pages; i++) { struct folio *folio = page_folio(pages[i]); if (folio == prev_folio) continue; prev_folio = folio; if (folio_is_longterm_pinnable(folio)) continue; collected++; if (folio_is_device_coherent(folio)) continue; if (folio_test_hugetlb(folio)) { isolate_hugetlb(folio, movable_page_list); continue; } if (!folio_test_lru(folio) && drain_allow) { lru_add_drain_all(); drain_allow = false; } if (!folio_isolate_lru(folio)) continue; list_add_tail(&folio->lru, movable_page_list); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } return collected; } /* * Unpins all pages and migrates device coherent pages and movable_page_list. * Returns -EAGAIN if all pages were successfully migrated or -errno for failure * (or partial success). */ static int migrate_longterm_unpinnable_pages( struct list_head *movable_page_list, unsigned long nr_pages, struct page **pages) { int ret; unsigned long i; for (i = 0; i < nr_pages; i++) { struct folio *folio = page_folio(pages[i]); if (folio_is_device_coherent(folio)) { /* * Migration will fail if the page is pinned, so convert * the pin on the source page to a normal reference. */ pages[i] = NULL; folio_get(folio); gup_put_folio(folio, 1, FOLL_PIN); if (migrate_device_coherent_page(&folio->page)) { ret = -EBUSY; goto err; } continue; } /* * We can't migrate pages with unexpected references, so drop * the reference obtained by __get_user_pages_locked(). * Migrating pages have been added to movable_page_list after * calling folio_isolate_lru() which takes a reference so the * page won't be freed if it's migrating. */ unpin_user_page(pages[i]); pages[i] = NULL; } if (!list_empty(movable_page_list)) { struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, }; if (migrate_pages(movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_LONGTERM_PIN, NULL)) { ret = -ENOMEM; goto err; } } putback_movable_pages(movable_page_list); return -EAGAIN; err: for (i = 0; i < nr_pages; i++) if (pages[i]) unpin_user_page(pages[i]); putback_movable_pages(movable_page_list); return ret; } /* * Check whether all pages are *allowed* to be pinned. Rather confusingly, all * pages in the range are required to be pinned via FOLL_PIN, before calling * this routine. * * If any pages in the range are not allowed to be pinned, then this routine * will migrate those pages away, unpin all the pages in the range and return * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then * call this routine again. * * If an error other than -EAGAIN occurs, this indicates a migration failure. * The caller should give up, and propagate the error back up the call stack. * * If everything is OK and all pages in the range are allowed to be pinned, then * this routine leaves all pages pinned and returns zero for success. */ static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) { unsigned long collected; LIST_HEAD(movable_page_list); collected = collect_longterm_unpinnable_pages(&movable_page_list, nr_pages, pages); if (!collected) return 0; return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages, pages); } #else static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) { return 0; } #endif /* CONFIG_MIGRATION */ /* * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which * allows us to process the FOLL_LONGTERM flag. */ static long __gup_longterm_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int gup_flags) { unsigned int flags; long rc, nr_pinned_pages; if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags); flags = memalloc_pin_save(); do { nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags); if (nr_pinned_pages <= 0) { rc = nr_pinned_pages; break; } /* FOLL_LONGTERM implies FOLL_PIN */ rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); } while (rc == -EAGAIN); memalloc_pin_restore(flags); return rc ? rc : nr_pinned_pages; } /* * Check that the given flags are valid for the exported gup/pup interface, and * update them with the required flags that the caller must have set. */ static bool is_valid_gup_args(struct page **pages, int *locked, unsigned int *gup_flags_p, unsigned int to_set) { unsigned int gup_flags = *gup_flags_p; /* * These flags not allowed to be specified externally to the gup * interfaces: * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only * - FOLL_REMOTE is internal only and used on follow_page() * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL */ if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS)) return false; gup_flags |= to_set; if (locked) { /* At the external interface locked must be set */ if (WARN_ON_ONCE(*locked != 1)) return false; gup_flags |= FOLL_UNLOCKABLE; } /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return false; /* LONGTERM can only be specified when pinning */ if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM))) return false; /* Pages input must be given if using GET/PIN */ if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages)) return false; /* We want to allow the pgmap to be hot-unplugged at all times */ if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA))) return false; *gup_flags_p = gup_flags; return true; } #ifdef CONFIG_MMU /** * get_user_pages_remote() - pin user pages in memory * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Returns either number of pages pinned (which may be less than the * number requested), or an error. Details about the return value: * * -- If nr_pages is 0, returns 0. * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. * * The caller is responsible for releasing returned @pages, via put_page(). * * Must be called with mmap_lock held for read or write. * * get_user_pages_remote walks a process's page tables and takes a reference * to each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * get_user_pages_remote returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re-faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must * be called after the page is finished with, and before put_page is called. * * get_user_pages_remote is typically used for fewer-copy IO operations, * to get a handle on the memory by some means other than accesses * via the user virtual addresses. The pages may be submitted for * DMA to devices or accessed via their kernel linear mapping (via the * kmap APIs). Care should be taken to use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. * * get_user_pages_remote should be phased out in favor of * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; return __get_user_pages_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_remote); #else /* CONFIG_MMU */ long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { return 0; } #endif /* !CONFIG_MMU */ /** * get_user_pages() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * * This is the same as get_user_pages_remote(), just with a less-flexible * calling convention where we assume that the mm being operated on belongs to * the current task, and doesn't allow passing of a locked parameter. We also * obviously don't pass FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages); /* * get_user_pages_unlocked() is suitable to replace the form: * * mmap_read_lock(mm); * get_user_pages(mm, ..., pages, NULL); * mmap_read_unlock(mm); * * with: * * get_user_pages_unlocked(mm, ..., pages); * * It is functionally equivalent to get_user_pages_fast so * get_user_pages_fast should be used instead if specific gup_flags * (e.g. FOLL_FORCE) are not required. */ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { int locked = 0; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH | FOLL_UNLOCKABLE)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_unlocked); /* * Fast GUP * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be * protected from page table pages being freed from under it, and should * block any THP splits. * * One way to achieve this is to have the walker disable interrupts, and * rely on IPIs from the TLB flushing code blocking before the page table * pages are freed. This is unsuitable for architectures that do not need * to broadcast an IPI when invalidating TLBs. * * Another way to achieve this is to batch up page table containing pages * belonging to more than one mm_user, then rcu_sched a callback to free those * pages. Disabling interrupts will allow the fast_gup walker to both block * the rcu_sched callback, and an IPI that we broadcast for splitting THPs * (which is a relatively rare event). The code below adopts this strategy. * * Before activating this code, please be aware that the following assumptions * are currently made: * * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to * free pages containing page tables or TLB flushing requires IPI broadcast. * * *) ptes can be read atomically by the architecture. * * *) access_ok is sufficient to validate userspace address ranges. * * The last two assumptions can be relaxed by the addition of helper functions. * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ #ifdef CONFIG_HAVE_FAST_GUP /* * Used in the GUP-fast path to determine whether a pin is permitted for a * specific folio. * * This call assumes the caller has pinned the folio, that the lowest page table * level still points to this folio, and that interrupts have been disabled. * * Writing to pinned file-backed dirty tracked folios is inherently problematic * (see comment describing the writable_file_mapping_allowed() function). We * therefore try to avoid the most egregious case of a long-term mapping doing * so. * * This function cannot be as thorough as that one as the VMA is not available * in the fast path, so instead we whitelist known good cases and if in doubt, * fall back to the slow path. */ static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags) { struct address_space *mapping; unsigned long mapping_flags; /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the one we disallow. */ if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) != (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) return true; /* The folio is pinned, so we can safely access folio fields. */ if (WARN_ON_ONCE(folio_test_slab(folio))) return false; /* hugetlb mappings do not require dirty-tracking. */ if (folio_test_hugetlb(folio)) return true; /* * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods * cannot proceed, which means no actions performed under RCU can * proceed either. * * inodes and thus their mappings are freed under RCU, which means the * mapping cannot be freed beneath us and thus we can safely dereference * it. */ lockdep_assert_irqs_disabled(); /* * However, there may be operations which _alter_ the mapping, so ensure * we read it once and only once. */ mapping = READ_ONCE(folio->mapping); /* * The mapping may have been truncated, in any case we cannot determine * if this mapping is safe - fall back to slow path to determine how to * proceed. */ if (!mapping) return false; /* Anonymous folios pose no problem. */ mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS; if (mapping_flags) return mapping_flags & PAGE_MAPPING_ANON; /* * At this point, we know the mapping is non-null and points to an * address_space object. The only remaining whitelisted file system is * shmem. */ return shmem_mapping(mapping); } static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, unsigned int flags, struct page **pages) { while ((*nr) - nr_start) { struct page *page = pages[--(*nr)]; ClearPageReferenced(page); if (flags & FOLL_PIN) unpin_user_page(page); else put_page(page); } } #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* * Fast-gup relies on pte change detection to avoid concurrent pgtable * operations. * * To pin the page, fast-gup needs to do below in order: * (1) pin the page (by prefetching pte), then (2) check pte not changed. * * For the rest of pgtable operations where pgtable updates can be racy * with fast-gup, we need to do (1) clear pte, then (2) check whether page * is pinned. * * Above will work for all pte-level operations, including THP split. * * For THP collapse, it's a bit more complicated because fast-gup may be * walking a pgtable page that is being freed (pte is still valid but pmd * can be cleared already). To avoid race in such condition, we need to * also check pmd here to make sure pmd doesn't change (corresponds to * pmdp_collapse_flush() in the THP collapse code path). */ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); if (!ptep) return 0; do { pte_t pte = ptep_get_lockless(ptep); struct page *page; struct folio *folio; /* * Always fallback to ordinary GUP on PROT_NONE-mapped pages: * pte_access_permitted() better should reject these pages * either way: otherwise, GUP-fast might succeed in * cases where ordinary GUP would fail due to VMA access * permissions. */ if (pte_protnone(pte)) goto pte_unmap; if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; if (pte_devmap(pte)) { if (unlikely(flags & FOLL_LONGTERM)) goto pte_unmap; pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); goto pte_unmap; } } else if (pte_special(pte)) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); folio = try_grab_folio(page, 1, flags); if (!folio) goto pte_unmap; if (unlikely(folio_is_secretmem(folio))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } /* * We need to make the page accessible if and only if we are * going to access its content (the FOLL_PIN case). Please * see Documentation/core-api/pin_user_pages.rst for * details. */ if (flags & FOLL_PIN) { ret = arch_make_page_accessible(page); if (ret) { gup_put_folio(folio, 1, flags); goto pte_unmap; } } folio_set_referenced(folio); pages[*nr] = page; (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); ret = 1; pte_unmap: if (pgmap) put_dev_pagemap(pgmap); pte_unmap(ptem); return ret; } #else /* * If we can't determine whether or not a pte is special, then fail immediately * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. */ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; do { struct page *page = pfn_to_page(pfn); pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); put_dev_pagemap(pgmap); return addr == end; } static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } #else static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } #endif static int record_subpages(struct page *page, unsigned long addr, unsigned long end, struct page **pages) { int nr; for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) pages[nr] = nth_page(page, nr); return nr; } #ifdef CONFIG_ARCH_HAS_HUGEPD static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) { unsigned long __boundary = (addr + sz) & ~(sz-1); return (__boundary - 1 < end - 1) ? __boundary : end; } static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long pte_end; struct page *page; struct folio *folio; pte_t pte; int refs; pte_end = (addr + sz) & ~(sz-1); if (pte_end < end) end = pte_end; pte = huge_ptep_get(ptep); if (!pte_access_permitted(pte, flags & FOLL_WRITE)) return 0; /* hugepages are never "special" */ VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, unsigned long end, unsigned int flags, struct page **pages, int *nr) { pte_t *ptep; unsigned long sz = 1UL << hugepd_shift(hugepd); unsigned long next; ptep = hugepte_offset(hugepd, addr, pdshift); do { next = hugepte_addr_end(addr, end, sz); if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr)) return 0; } while (ptep++, addr = next, addr != end); return 1; } #else static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_HUGEPD */ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *page; struct folio *folio; int refs; if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pmd_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; return __gup_device_huge_pmd(orig, pmdp, addr, end, flags, pages, nr); } page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *page; struct folio *folio; int refs; if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pud_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; return __gup_device_huge_pud(orig, pudp, addr, end, flags, pages, nr); } page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int refs; struct page *page; struct folio *folio; if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) return 0; BUILD_BUG_ON(pgd_devmap(orig)); page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT); refs = record_subpages(page, addr, end, pages + *nr); folio = try_grab_folio(page, refs, flags); if (!folio) return 0; if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { gup_put_folio(folio, refs, flags); return 0; } if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } if (!folio_fast_pin_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; pmdp = pmd_offset_lockless(pudp, pud, addr); do { pmd_t pmd = pmdp_get_lockless(pmdp); next = pmd_addr_end(addr, end); if (!pmd_present(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd))) { /* See gup_pte_range() */ if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) { /* * architecture have different format for hugetlbfs * pmd format and THP pmd format */ if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, PMD_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); return 1; } static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; pudp = pud_offset_lockless(p4dp, p4d, addr); do { pud_t pud = READ_ONCE(*pudp); next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) return 0; if (unlikely(pud_huge(pud) || pud_devmap(pud))) { if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, PUD_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); return 1; } static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; p4dp = p4d_offset_lockless(pgdp, pgd, addr); do { p4d_t p4d = READ_ONCE(*p4dp); next = p4d_addr_end(addr, end); if (p4d_none(p4d)) return 0; BUILD_BUG_ON(p4d_huge(p4d)); if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, P4D_SHIFT, next, flags, pages, nr)) return 0; } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); return 1; } static void gup_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pgd_t *pgdp; pgdp = pgd_offset(current->mm, addr); do { pgd_t pgd = READ_ONCE(*pgdp); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) return; if (unlikely(pgd_huge(pgd))) { if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, pages, nr)) return; } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, PGDIR_SHIFT, next, flags, pages, nr)) return; } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } #else static inline void gup_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { } #endif /* CONFIG_HAVE_FAST_GUP */ #ifndef gup_fast_permitted /* * Check if it's allowed to use get_user_pages_fast_only() for the range, or * we need to fall back to the slow version: */ static bool gup_fast_permitted(unsigned long start, unsigned long end) { return true; } #endif static unsigned long lockless_pages_from_mm(unsigned long start, unsigned long end, unsigned int gup_flags, struct page **pages) { unsigned long flags; int nr_pinned = 0; unsigned seq; if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || !gup_fast_permitted(start, end)) return 0; if (gup_flags & FOLL_PIN) { seq = raw_read_seqcount(&current->mm->write_protect_seq); if (seq & 1) return 0; } /* * Disable interrupts. The nested form is used, in order to allow full, * general purpose use of this routine. * * With interrupts disabled, we block page table pages from being freed * from under us. See struct mmu_table_batch comments in * include/asm-generic/tlb.h for more details. * * We do not adopt an rcu_read_lock() here as we also want to block IPIs * that come from THPs splitting. */ local_irq_save(flags); gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); local_irq_restore(flags); /* * When pinning pages for DMA there could be a concurrent write protect * from fork() via copy_page_range(), in this case always fail fast GUP. */ if (gup_flags & FOLL_PIN) { if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) { unpin_user_pages_lockless(pages, nr_pinned); return 0; } else { sanity_check_pinned_pages(pages, nr_pinned); } } return nr_pinned; } static int internal_get_user_pages_fast(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { unsigned long len, end; unsigned long nr_pinned; int locked = 0; int ret; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | FOLL_FAST_ONLY | FOLL_NOFAULT | FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT))) return -EINVAL; if (gup_flags & FOLL_PIN) mm_set_has_pinned_flag(&current->mm->flags); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; len = nr_pages << PAGE_SHIFT; if (check_add_overflow(start, len, &end)) return -EOVERFLOW; if (end > TASK_SIZE_MAX) return -EFAULT; if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages); if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) return nr_pinned; /* Slow path: try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, pages, &locked, gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so * returning -errno is not an option */ if (nr_pinned) return nr_pinned; return ret; } return ret + nr_pinned; } /** * get_user_pages_fast_only() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. * * If the architecture does not support this function, simply return with no * pages pinned. * * Careful, careful! COW breaking can go either way, so a non-write * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, * because gup fast is always a "pin with a +1 page refcount" request. * * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Attempt to pin user pages in memory without taking mm->mmap_lock. * If not successful, it will fall back to taking the lock and * calling get_user_pages(). * * Returns number of pages pinned. This may be fewer than the number requested. * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns * -errno. */ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * The caller may or may not have explicitly set FOLL_GET; either way is * OK. However, internally (within mm/gup.c), gup fast variants must set * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" * request. */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); /** * pin_user_pages_fast() - pin user pages in memory without taking locks * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See * get_user_pages_fast() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page() will not remove pins from it. */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(pin_user_pages_fast); /** * pin_user_pages_remote() - pin pages of a remote process * * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See * get_user_pages_remote() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; return __gup_longterm_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_remote); /** * pin_user_pages() - pin user pages in memory for use by other devices * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); /* * pin_user_pages_unlocked() is the FOLL_PIN variant of * get_user_pages_unlocked(). Behavior is the same, except that this one sets * FOLL_PIN and rejects FOLL_GET. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { int locked = 0; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked);
584 3134 184 68 3114 56 18 120 59 439 1973 123 3110 581 2607 2722 2592 152 423 3236 3111 2688 3112 2726 115 2644 2689 143 3117 3114 831 2531 2528 2532 2528 1828 608 2499 2492 1179 1177 30 130 6 6 1179 24 112 155 155 1658 56 556 46 553 12 6 1 1 520 227 3 227 12 227 303 26 556 37 36 228 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 /* SPDX-License-Identifier: GPL-2.0 */ /* * Resizable, Scalable, Concurrent Hash Table * * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H #include <linux/err.h> #include <linux/errno.h> #include <linux/jhash.h> #include <linux/list_nulls.h> #include <linux/workqueue.h> #include <linux/rculist.h> #include <linux/bit_spinlock.h> #include <linux/rhashtable-types.h> /* * Objects in an rhashtable have an embedded struct rhash_head * which is linked into as hash chain from the hash table - or one * of two or more hash tables when the rhashtable is being resized. * The end of the chain is marked with a special nulls marks which has * the least significant bit set but otherwise stores the address of * the hash bucket. This allows us to be sure we've found the end * of the right list. * The value stored in the hash bucket has BIT(0) used as a lock bit. * This bit must be atomically set before any changes are made to * the chain. To avoid dereferencing this pointer without clearing * the bit first, we use an opaque 'struct rhash_lock_head *' for the * pointer stored in the bucket. This struct needs to be defined so * that rcu_dereference() works on it, but it has no content so a * cast is needed for it to be useful. This ensures it isn't * used by mistake with clearing the lock bit first. */ struct rhash_lock_head {}; /* Maximum chain length before rehash * * The maximum (not average) chain length grows with the size of the hash * table, at a rate of (log N)/(log log N). * * The value of 16 is selected so that even if the hash table grew to * 2^32 you would not expect the maximum chain length to exceed it * unless we are under attack (or extremely unlucky). * * As this limit is only to detect attacks, we don't need to set it to a * lower value as you'd need the chain length to vastly exceed 16 to have * any real effect on the system. */ #define RHT_ELASTICITY 16u /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets * @nest: Number of bits of first-level nested table. * @rehash: Current bucket being rehashed * @hash_rnd: Random seed to fold into hash * @walkers: List of active walkers * @rcu: RCU structure for freeing the table * @future_tbl: Table under construction during rehashing * @ntbl: Nested table used when out of memory. * @buckets: size * hash buckets */ struct bucket_table { unsigned int size; unsigned int nest; u32 hash_rnd; struct list_head walkers; struct rcu_head rcu; struct bucket_table __rcu *future_tbl; struct lockdep_map dep_map; struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; /* * NULLS_MARKER() expects a hash value with the low * bits mostly likely to be significant, and it discards * the msb. * We give it an address, in which the bottom bit is * always 0, and the msb might be significant. * So we shift the address down one bit to align with * expectations and avoid losing a significant bit. * * We never store the NULLS_MARKER in the hash table * itself as we need the lsb for locking. * Instead we store a NULL */ #define RHT_NULLS_MARKER(ptr) \ ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1)) #define INIT_RHT_NULLS_HEAD(ptr) \ ((ptr) = NULL) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) { return ((unsigned long) ptr & 1); } static inline void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) { return (char *)he - ht->p.head_offset; } static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, unsigned int hash) { return hash & (tbl->size - 1); } static inline unsigned int rht_key_get_hash(struct rhashtable *ht, const void *key, const struct rhashtable_params params, unsigned int hash_rnd) { unsigned int hash; /* params must be equal to ht->p if it isn't constant. */ if (!__builtin_constant_p(params.key_len)) hash = ht->p.hashfn(key, ht->key_len, hash_rnd); else if (params.key_len) { unsigned int key_len = params.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); else if (key_len & (sizeof(u32) - 1)) hash = jhash(key, key_len, hash_rnd); else hash = jhash2(key, key_len / sizeof(u32), hash_rnd); } else { unsigned int key_len = ht->p.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); else hash = jhash(key, key_len, hash_rnd); } return hash; } static inline unsigned int rht_key_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const void *key, const struct rhashtable_params params) { unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd); return rht_bucket_index(tbl, hash); } static inline unsigned int rht_head_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he, const struct rhashtable_params params) { const char *ptr = rht_obj(ht, he); return likely(params.obj_hashfn) ? rht_bucket_index(tbl, params.obj_hashfn(ptr, params.key_len ?: ht->p.key_len, tbl->hash_rnd)) : rht_key_hashfn(ht, tbl, ptr + params.key_offset, params); } /** * rht_grow_above_75 - returns true if nelems > 0.75 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_75(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_shrink_below_30(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && tbl->size > ht->p.min_size; } /** * rht_grow_above_100 - returns true if nelems > table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_100(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) > tbl->size && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_grow_above_max - returns true if table is above maximum * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_max(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) >= ht->max_elems; } #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); #else static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return 1; } static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { return 1; } #endif /* CONFIG_PROVE_LOCKING */ void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj); void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter); void rhashtable_walk_exit(struct rhashtable_iter *iter); int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU); static inline void rhashtable_walk_start(struct rhashtable_iter *iter) { (void)rhashtable_walk_start_check(iter); } void *rhashtable_walk_next(struct rhashtable_iter *iter); void *rhashtable_walk_peek(struct rhashtable_iter *iter); void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU); void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg); void rhashtable_destroy(struct rhashtable *ht); struct rhash_lock_head __rcu **rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash); struct rhash_lock_head __rcu **__rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash); struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_rcu(p, ht) \ rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_bucket(p, tbl, hash) \ rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_dereference_bucket_rcu(p, tbl, hash) \ rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) static inline struct rhash_lock_head __rcu *const *rht_bucket( const struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_var( struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : &tbl->buckets[hash]; } /* * We lock a bucket by setting BIT(0) in the pointer - this is always * zero in real pointers. The NULLS mark is never stored in the bucket, * rather we store NULL if the bucket is empty. * bit_spin_locks do not handle contention well, but the whole point * of the hashtable design is to achieve minimum per-bucket contention. * A nested hash table might not have a bucket pointer. In that case * we cannot get a lock. For remove and replace the bucket cannot be * interesting and doesn't need locking. * For insert we allocate the bucket if this is the last bucket_table, * and then take the lock. * Sometimes we unlock a bucket by writing a new pointer there. In that * case we don't need to unlock, but we do need to reset state such as * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() * provides the same release semantics that bit_spin_unlock() provides, * this is safe. * When we write to a bucket without unlocking, we use rht_assign_locked(). */ static inline unsigned long rht_lock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt) { unsigned long flags; local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bkt); lock_map_acquire(&tbl->dep_map); return flags; } static inline unsigned long rht_lock_nested(struct bucket_table *tbl, struct rhash_lock_head __rcu **bucket, unsigned int subclass) { unsigned long flags; local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bucket); lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); return flags; } static inline void rht_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, unsigned long flags) { lock_map_release(&tbl->dep_map); bit_spin_unlock(0, (unsigned long *)bkt); local_irq_restore(flags); } static inline struct rhash_head *__rht_ptr( struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) { return (struct rhash_head *) ((unsigned long)p & ~BIT(0) ?: (unsigned long)RHT_NULLS_MARKER(bkt)); } /* * Where 'bkt' is a bucket and might be locked: * rht_ptr_rcu() dereferences that pointer and clears the lock bit. * rht_ptr() dereferences in a context where the bucket is locked. * rht_ptr_exclusive() dereferences in a context where exclusive * access is guaranteed, such as when destroying the table. */ static inline struct rhash_head *rht_ptr_rcu( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr(rcu_dereference(*bkt), bkt); } static inline struct rhash_head *rht_ptr( struct rhash_lock_head __rcu *const *bkt, struct bucket_table *tbl, unsigned int hash) { return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); } static inline struct rhash_head *rht_ptr_exclusive( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); } static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, struct rhash_head *obj) { if (rht_is_a_nulls(obj)) obj = NULL; rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0))); } static inline void rht_assign_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, struct rhash_head *obj, unsigned long flags) { if (rht_is_a_nulls(obj)) obj = NULL; lock_map_release(&tbl->dep_map); rcu_assign_pointer(*bkt, (void *)obj); preempt_enable(); __release(bitlock); local_irq_restore(flags); } /** * rht_for_each_from - iterate over hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each_from(pos, head, tbl, hash) \ for (pos = head; \ !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each - iterate over hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash) /** * rht_for_each_entry_from - iterate over hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \ for (pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each_entry - iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ rht_for_each_entry_from(tpos, pos, \ rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash, member) /** * rht_for_each_entry_safe - safely iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @next: the &struct rhash_head to use as next in loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = next, \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** * rht_for_each_rcu_from - iterate over rcu hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu_from(pos, head, tbl, hash) \ for (({barrier(); }), \ pos = head; \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_rcu - iterate over rcu hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu(pos, tbl, hash) \ for (({barrier(); }), \ pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) /** * rht_for_each_entry_rcu - iterate over rcu hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ rht_for_each_entry_rcu_from(tpos, pos, \ rht_ptr_rcu(rht_bucket(tbl, hash)), \ tbl, hash, member) /** * rhl_for_each_rcu - iterate over rcu hash table list * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_rcu(pos, list) \ for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) /** * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * @member: name of the &struct rlist_head within the hashable struct. * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_entry_rcu(tpos, pos, list, member) \ for (pos = list; pos && rht_entry(tpos, pos, member); \ pos = rcu_dereference_raw(pos->next)) static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) { struct rhashtable *ht = arg->ht; const char *ptr = obj; return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); } /* Internal function, do not use. */ static inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu *const *bkt; struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); bkt = rht_bucket(tbl, hash); do { rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { if (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) continue; return he; } /* An object might have been moved to a different hash chain, * while we walk along it - better check and retry. */ } while (he != RHT_NULLS_MARKER(bkt)); /* Ensure we see any new tables. */ smp_rmb(); tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(tbl)) goto restart; return NULL; } /** * rhashtable_lookup - search hash table * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for a entry with an identical key. The first matching entry is returned. * * This must only be called under the RCU read lock. * * Returns the first entry on which the compare function returned true. */ static inline void *rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { struct rhash_head *he = __rhashtable_lookup(ht, key, params); return he ? rht_obj(ht, he) : NULL; } /** * rhashtable_lookup_fast - search hash table, without RCU read lock * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for a entry with an identical key. The first matching entry is returned. * * Only use this function when you have other mechanisms guaranteeing * that the object won't go away after the RCU read lock is released. * * Returns the first entry on which the compare function returned true. */ static inline void *rhashtable_lookup_fast( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { void *obj; rcu_read_lock(); obj = rhashtable_lookup(ht, key, params); rcu_read_unlock(); return obj; } /** * rhltable_lookup - search hash list table * @hlt: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for a entry with an identical key. All matching entries are returned * in a list. * * This must only be called under the RCU read lock. * * Returns the list of entries that match the given key. */ static inline struct rhlist_head *rhltable_lookup( struct rhltable *hlt, const void *key, const struct rhashtable_params params) { struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); return he ? container_of(he, struct rhlist_head, rhead) : NULL; } /* Internal function, please use rhashtable_insert_fast() instead. This * function returns the existing element already in hashes in there is a clash, * otherwise it returns an error via ERR_PTR(). */ static inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; unsigned long flags; unsigned int hash; int elasticity; void *data; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); hash = rht_head_hashfn(ht, tbl, obj, params); elasticity = RHT_ELASTICITY; bkt = rht_bucket_insert(ht, tbl, hash); data = ERR_PTR(-ENOMEM); if (!bkt) goto out; pprev = NULL; flags = rht_lock(tbl, bkt); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: rht_unlock(tbl, bkt, flags); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; elasticity--; if (!key || (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, head)) : rhashtable_compare(&arg, rht_obj(ht, head)))) { pprev = &head->next; continue; } data = rht_obj(ht, head); if (!rhlist) goto out_unlock; list = container_of(obj, struct rhlist_head, rhead); plist = container_of(head, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt, flags); } else rht_assign_unlock(tbl, bkt, obj, flags); data = NULL; goto out; } if (elasticity <= 0) goto slow_path; data = ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_max(ht, tbl))) goto out_unlock; if (unlikely(rht_grow_above_100(ht, tbl))) goto slow_path; /* Inserting at head of list makes unlocking free. */ head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (rhlist) { struct rhlist_head *list; list = container_of(obj, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, NULL); } atomic_inc(&ht->nelems); rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); data = NULL; out: rcu_read_unlock(); return data; out_unlock: rht_unlock(tbl, bkt, flags); goto out; } /** * rhashtable_insert_fast - insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhashtable_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhltable_insert_key - insert object into hash list table * @hlt: hash list table * @key: the pointer to the key * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhltable_insert_key( struct rhltable *hlt, const void *key, struct rhlist_head *list, const struct rhashtable_params params) { return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, params, true)); } /** * rhltable_insert - insert object into hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhltable_insert( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { const char *key = rht_obj(&hlt->ht, &list->rhead); key += params.key_offset; return rhltable_insert_key(hlt, key, list, params); } /** * rhashtable_lookup_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * This lookup function may only be used for fixed key hash table (key_len * parameter set). It will BUG() if used inappropriately. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static inline int rhashtable_lookup_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); void *ret; BUG_ON(ht->p.obj_hashfn); ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_fast(), but this function returns the * object if it exists, NULL if it did not and the insertion was successful, * and an ERR_PTR otherwise. */ static inline void *rhashtable_lookup_get_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); BUG_ON(ht->p.obj_hashfn); return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); } /** * rhashtable_lookup_insert_key - search and insert object to hash table * with explicit key * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Lookups may occur in parallel with hashtable mutations and resizing. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. * * Returns zero on success. */ static inline int rhashtable_lookup_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; BUG_ON(!ht->p.obj_hashfn || !key); ret = __rhashtable_insert_fast(ht, key, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_key - lookup and insert object into hash table * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_key(), but this function returns the * object if it exists, NULL if it does not and the insertion was successful, * and an ERR_PTR otherwise. */ static inline void *rhashtable_lookup_get_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { BUG_ON(!ht->p.obj_hashfn || !key); return __rhashtable_insert_fast(ht, key, obj, params, false); } /* Internal function, please use rhashtable_remove_fast() instead */ static inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned long flags; unsigned int hash; int err = -ENOENT; hash = rht_head_hashfn(ht, tbl, obj, params); bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; list = container_of(he, struct rhlist_head, rhead); if (he != obj) { struct rhlist_head __rcu **lpprev; pprev = &he->next; if (!rhlist) continue; do { lpprev = &list->next; list = rht_dereference_bucket(list->next, tbl, hash); } while (list && obj != &list->rhead); if (!list) continue; list = rht_dereference_bucket(list->next, tbl, hash); RCU_INIT_POINTER(*lpprev, list); err = 0; break; } obj = rht_dereference_bucket(obj->next, tbl, hash); err = 1; if (rhlist) { list = rht_dereference_bucket(list->next, tbl, hash); if (list) { RCU_INIT_POINTER(list->rhead.next, obj); obj = &list->rhead; err = 0; } } if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt, flags); } else { rht_assign_unlock(tbl, bkt, obj, flags); } goto unlocked; } rht_unlock(tbl, bkt, flags); unlocked: if (err > 0) { atomic_dec(&ht->nelems); if (unlikely(ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))) schedule_work(&ht->run_work); err = 0; } return err; } /* Internal function, please use rhashtable_remove_fast() instead */ static inline int __rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, rhlist)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhashtable_remove_fast - remove object from hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30%. * * Returns zero on success, -ENOENT if the entry could not be found. */ static inline int rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { return __rhashtable_remove_fast(ht, obj, params, false); } /** * rhltable_remove - remove object from hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30% * * Returns zero on success, -ENOENT if the entry could not be found. */ static inline int rhltable_remove( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); } /* Internal function, please use rhashtable_replace_fast() instead */ static inline int __rhashtable_replace_fast( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned long flags; unsigned int hash; int err = -ENOENT; /* Minimally, the old and new objects must have same hash * (which should mean identifiers are the same). */ hash = rht_head_hashfn(ht, tbl, obj_old, params); if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) return -EINVAL; bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; } rcu_assign_pointer(obj_new->next, obj_old->next); if (pprev) { rcu_assign_pointer(*pprev, obj_new); rht_unlock(tbl, bkt, flags); } else { rht_assign_unlock(tbl, bkt, obj_new, flags); } err = 0; goto unlocked; } rht_unlock(tbl, bkt, flags); unlocked: return err; } /** * rhashtable_replace_fast - replace an object in hash table * @ht: hash table * @obj_old: pointer to hash head inside object being replaced * @obj_new: pointer to hash head inside object which is new * @params: hash table parameters * * Replacing an object doesn't affect the number of elements in the hash table * or bucket, so we don't need to worry about shrinking or expanding the * table here. * * Returns zero on success, -ENOENT if the entry could not be found, * -EINVAL if hash is not the same for the old and new objects. */ static inline int rhashtable_replace_fast( struct rhashtable *ht, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, obj_new, params)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhltable_walk_enter - Initialise an iterator * @hlt: Table to walk over * @iter: Hash table Iterator * * This function prepares a hash table walk. * * Note that if you restart a walk after rhashtable_walk_stop you * may see the same object twice. Also, you may miss objects if * there are removals in between rhashtable_walk_stop and the next * call to rhashtable_walk_start. * * For a completely stable walk you should construct your own data * structure outside the hash table. * * This function may be called from any process context, including * non-preemptable context, but cannot be called from softirq or * hardirq context. * * You must call rhashtable_walk_exit after this function returns. */ static inline void rhltable_walk_enter(struct rhltable *hlt, struct rhashtable_iter *iter) { return rhashtable_walk_enter(&hlt->ht, iter); } /** * rhltable_free_and_destroy - free elements and destroy hash list table * @hlt: the hash list table to destroy * @free_fn: callback to release resources of element * @arg: pointer passed to free_fn * * See documentation for rhashtable_free_and_destroy. */ static inline void rhltable_free_and_destroy(struct rhltable *hlt, void (*free_fn)(void *ptr, void *arg), void *arg) { return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); } static inline void rhltable_destroy(struct rhltable *hlt) { return rhltable_free_and_destroy(hlt, NULL, NULL); } #endif /* _LINUX_RHASHTABLE_H */
5224 17535 14203 14203 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_CONTEXT_H #define __X86_KERNEL_FPU_CONTEXT_H #include <asm/fpu/xstate.h> #include <asm/trace/fpu.h> /* Functions related to FPU context tracking */ /* * The in-register FPU state for an FPU context on a CPU is assumed to be * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx * matches the FPU. * * If the FPU register state is valid, the kernel can skip restoring the * FPU state from memory. * * Any code that clobbers the FPU registers or updates the in-memory * FPU state for a task MUST let the rest of the kernel know that the * FPU registers are no longer valid for this task. * * Invalidate a resource you control: CPU if using the CPU for something else * (with preemption disabled), FPU for the current task, or a task that * is prevented from running by the current task. */ static inline void __cpu_invalidate_fpregs_state(void) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) { fpu->last_cpu = -1; } static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } static inline void fpregs_deactivate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); } static inline void fpregs_activate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); } /* Internal helper for switch_fpu_return() and signal frame setup */ static inline void fpregs_restore_userregs(void) { struct fpu *fpu = &current->thread.fpu; int cpu = smp_processor_id(); if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) return; if (!fpregs_state_valid(fpu, cpu)) { /* * This restores _all_ xstate which has not been * established yet. * * If PKRU is enabled, then the PKRU value is already * correct because it was either set in switch_to() or in * flush_thread(). So it is excluded because it might be * not up to date in current->thread.fpu.xsave state. * * XFD state is handled in restore_fpregs_from_fpstate(). */ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); fpregs_activate(fpu); fpu->last_cpu = cpu; } clear_thread_flag(TIF_NEED_FPU_LOAD); } #endif
1304 1304 6586 6585 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 // SPDX-License-Identifier: GPL-2.0 /* * IA-32 Huge TLB Page Support for Kernel. * * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> */ #include <linux/init.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/err.h> #include <linux/sysctl.h> #include <linux/compat.h> #include <asm/mman.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/elf.h> /* * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. * Otherwise, returns 0. */ int pmd_huge(pmd_t pmd) { return !pmd_none(pmd) && (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; } /* * pud_huge() returns 1 if @pud is hugetlb related entry, that is normal * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. * Otherwise, returns 0. */ int pud_huge(pud_t pud) { #if CONFIG_PGTABLE_LEVELS > 2 return !pud_none(pud) && (pud_val(pud) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; #else return 0; #endif } #ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; info.flags = 0; info.length = len; info.low_limit = get_mmap_base(1); /* * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area * in the full address space. */ info.high_limit = in_32bit_syscall() ? task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); /* * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area * in the full address space. */ if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. */ if (addr & ~PAGE_MASK) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = TASK_SIZE_LOW; addr = vm_unmapped_area(&info); } return addr; } unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; if (len & ~huge_page_mask(h)) return -EINVAL; if (len > TASK_SIZE) return -ENOMEM; /* No address checking. See comment at mmap_address_hint_valid() */ if (flags & MAP_FIXED) { if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } if (addr) { addr &= huge_page_mask(h); if (!mmap_address_hint_valid(addr, len)) goto get_unmapped_area; vma = find_vma(mm, addr); if (!vma || addr + len <= vm_start_gap(vma)) return addr; } get_unmapped_area: if (mm->get_unmapped_area == arch_get_unmapped_area) return hugetlb_get_unmapped_area_bottomup(file, addr, len, pgoff, flags); else return hugetlb_get_unmapped_area_topdown(file, addr, len, pgoff, flags); } #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_X86_64 bool __init arch_hugetlb_valid_size(unsigned long size) { if (size == PMD_SIZE) return true; else if (size == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) return true; else return false; } #ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ if (boot_cpu_has(X86_FEATURE_GBPAGES)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; } arch_initcall(gigantic_pages_init); #endif #endif
51 11 51 51 51 51 34 8 34 34 34 81 81 81 134 125 119 123 122 121 148 148 124 119 88 28 27 60 8 12 12 5 32 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) 2007 Alan Stern * Copyright (C) 2009 IBM Corporation * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com> * * Authors: Alan Stern <stern@rowland.harvard.edu> * K.Prasad <prasad@linux.vnet.ibm.com> * Frederic Weisbecker <fweisbec@gmail.com> */ /* * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, * using the CPU's debug registers. */ #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/irqflags.h> #include <linux/notifier.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/percpu.h> #include <linux/kdebug.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/smp.h> #include <asm/hw_breakpoint.h> #include <asm/processor.h> #include <asm/debugreg.h> #include <asm/user.h> #include <asm/desc.h> #include <asm/tlbflush.h> /* Per cpu debug control register value */ DEFINE_PER_CPU(unsigned long, cpu_dr7); EXPORT_PER_CPU_SYMBOL(cpu_dr7); /* Per cpu debug address registers values */ static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); /* * Stores the breakpoints currently in use on each breakpoint address * register for each cpus */ static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); static inline unsigned long __encode_dr7(int drnum, unsigned int len, unsigned int type) { unsigned long bp_info; bp_info = (len | type) & 0xf; bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); return bp_info; } /* * Encode the length, type, Exact, and Enable bits for a particular breakpoint * as stored in debug register 7. */ unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) { return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN; } /* * Decode the length and type bits for a particular breakpoint as * stored in debug register 7. Return the "enabled" status. */ int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type) { int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); *len = (bp_info & 0xc) | 0x40; *type = (bp_info & 0x3) | 0x80; return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; } /* * Install a perf counter breakpoint. * * We seek a free debug address register and use it for this * breakpoint. Eventually we enable it in the debug control register. * * Atomic: we hold the counter->ctx->lock and we only handle variables * and registers local to this cpu. */ int arch_install_hw_breakpoint(struct perf_event *bp) { struct arch_hw_breakpoint *info = counter_arch_bp(bp); unsigned long *dr7; int i; lockdep_assert_irqs_disabled(); for (i = 0; i < HBP_NUM; i++) { struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); if (!*slot) { *slot = bp; break; } } if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) return -EBUSY; set_debugreg(info->address, i); __this_cpu_write(cpu_debugreg[i], info->address); dr7 = this_cpu_ptr(&cpu_dr7); *dr7 |= encode_dr7(i, info->len, info->type); /* * Ensure we first write cpu_dr7 before we set the DR7 register. * This ensures an NMI never see cpu_dr7 0 when DR7 is not. */ barrier(); set_debugreg(*dr7, 7); if (info->mask) amd_set_dr_addr_mask(info->mask, i); return 0; } /* * Uninstall the breakpoint contained in the given counter. * * First we search the debug address register it uses and then we disable * it. * * Atomic: we hold the counter->ctx->lock and we only handle variables * and registers local to this cpu. */ void arch_uninstall_hw_breakpoint(struct perf_event *bp) { struct arch_hw_breakpoint *info = counter_arch_bp(bp); unsigned long dr7; int i; lockdep_assert_irqs_disabled(); for (i = 0; i < HBP_NUM; i++) { struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); if (*slot == bp) { *slot = NULL; break; } } if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) return; dr7 = this_cpu_read(cpu_dr7); dr7 &= ~__encode_dr7(i, info->len, info->type); set_debugreg(dr7, 7); if (info->mask) amd_set_dr_addr_mask(0, i); /* * Ensure the write to cpu_dr7 is after we've set the DR7 register. * This ensures an NMI never see cpu_dr7 0 when DR7 is not. */ barrier(); this_cpu_write(cpu_dr7, dr7); } static int arch_bp_generic_len(int x86_len) { switch (x86_len) { case X86_BREAKPOINT_LEN_1: return HW_BREAKPOINT_LEN_1; case X86_BREAKPOINT_LEN_2: return HW_BREAKPOINT_LEN_2; case X86_BREAKPOINT_LEN_4: return HW_BREAKPOINT_LEN_4; #ifdef CONFIG_X86_64 case X86_BREAKPOINT_LEN_8: return HW_BREAKPOINT_LEN_8; #endif default: return -EINVAL; } } int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { int len; /* Type */ switch (x86_type) { case X86_BREAKPOINT_EXECUTE: if (x86_len != X86_BREAKPOINT_LEN_X) return -EINVAL; *gen_type = HW_BREAKPOINT_X; *gen_len = sizeof(long); return 0; case X86_BREAKPOINT_WRITE: *gen_type = HW_BREAKPOINT_W; break; case X86_BREAKPOINT_RW: *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; break; default: return -EINVAL; } /* Len */ len = arch_bp_generic_len(x86_len); if (len < 0) return -EINVAL; *gen_len = len; return 0; } /* * Check for virtual address in kernel space. */ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) { unsigned long va; int len; va = hw->address; len = arch_bp_generic_len(hw->len); WARN_ON_ONCE(len < 0); /* * We don't need to worry about va + len - 1 overflowing: * we already require that va is aligned to a multiple of len. */ return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); } /* * Checks whether the range [addr, end], overlaps the area [base, base + size). */ static inline bool within_area(unsigned long addr, unsigned long end, unsigned long base, unsigned long size) { return end >= base && addr < (base + size); } /* * Checks whether the range from addr to end, inclusive, overlaps the fixed * mapped CPU entry area range or other ranges used for CPU entry. */ static inline bool within_cpu_entry(unsigned long addr, unsigned long end) { int cpu; /* CPU entry erea is always used for CPU entry */ if (within_area(addr, end, CPU_ENTRY_AREA_BASE, CPU_ENTRY_AREA_MAP_SIZE)) return true; /* * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU * GSBASE value via __per_cpu_offset or pcpu_unit_offsets. */ #ifdef CONFIG_SMP if (within_area(addr, end, (unsigned long)__per_cpu_offset, sizeof(unsigned long) * nr_cpu_ids)) return true; #else if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets, sizeof(pcpu_unit_offsets))) return true; #endif for_each_possible_cpu(cpu) { /* The original rw GDT is being used after load_direct_gdt() */ if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu), GDT_SIZE)) return true; /* * cpu_tss_rw is not directly referenced by hardware, but * cpu_tss_rw is also used in CPU entry code, */ if (within_area(addr, end, (unsigned long)&per_cpu(cpu_tss_rw, cpu), sizeof(struct tss_struct))) return true; /* * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry. * If a data breakpoint on it, it will cause an unwanted #DB. * Protect the full cpu_tlbstate structure to be sure. */ if (within_area(addr, end, (unsigned long)&per_cpu(cpu_tlbstate, cpu), sizeof(struct tlb_state))) return true; /* * When in guest (X86_FEATURE_HYPERVISOR), local_db_save() * will read per-cpu cpu_dr7 before clear dr7 register. */ if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu), sizeof(cpu_dr7))) return true; } return false; } static int arch_build_bp_info(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) { unsigned long bp_end; bp_end = attr->bp_addr + attr->bp_len - 1; if (bp_end < attr->bp_addr) return -EINVAL; /* * Prevent any breakpoint of any type that overlaps the CPU * entry area and data. This protects the IST stacks and also * reduces the chance that we ever find out what happens if * there's a data breakpoint on the GDT, IDT, or TSS. */ if (within_cpu_entry(attr->bp_addr, bp_end)) return -EINVAL; hw->address = attr->bp_addr; hw->mask = 0; /* Type */ switch (attr->bp_type) { case HW_BREAKPOINT_W: hw->type = X86_BREAKPOINT_WRITE; break; case HW_BREAKPOINT_W | HW_BREAKPOINT_R: hw->type = X86_BREAKPOINT_RW; break; case HW_BREAKPOINT_X: /* * We don't allow kernel breakpoints in places that are not * acceptable for kprobes. On non-kprobes kernels, we don't * allow kernel breakpoints at all. */ if (attr->bp_addr >= TASK_SIZE_MAX) { if (within_kprobe_blacklist(attr->bp_addr)) return -EINVAL; } hw->type = X86_BREAKPOINT_EXECUTE; /* * x86 inst breakpoints need to have a specific undefined len. * But we still need to check userspace is not trying to setup * an unsupported length, to get a range breakpoint for example. */ if (attr->bp_len == sizeof(long)) { hw->len = X86_BREAKPOINT_LEN_X; return 0; } fallthrough; default: return -EINVAL; } /* Len */ switch (attr->bp_len) { case HW_BREAKPOINT_LEN_1: hw->len = X86_BREAKPOINT_LEN_1; break; case HW_BREAKPOINT_LEN_2: hw->len = X86_BREAKPOINT_LEN_2; break; case HW_BREAKPOINT_LEN_4: hw->len = X86_BREAKPOINT_LEN_4; break; #ifdef CONFIG_X86_64 case HW_BREAKPOINT_LEN_8: hw->len = X86_BREAKPOINT_LEN_8; break; #endif default: /* AMD range breakpoint */ if (!is_power_of_2(attr->bp_len)) return -EINVAL; if (attr->bp_addr & (attr->bp_len - 1)) return -EINVAL; if (!boot_cpu_has(X86_FEATURE_BPEXT)) return -EOPNOTSUPP; /* * It's impossible to use a range breakpoint to fake out * user vs kernel detection because bp_len - 1 can't * have the high bit set. If we ever allow range instruction * breakpoints, then we'll have to check for kprobe-blacklisted * addresses anywhere in the range. */ hw->mask = attr->bp_len - 1; hw->len = X86_BREAKPOINT_LEN_1; } return 0; } /* * Validate the arch-specific HW Breakpoint register settings */ int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) { unsigned int align; int ret; ret = arch_build_bp_info(bp, attr, hw); if (ret) return ret; switch (hw->len) { case X86_BREAKPOINT_LEN_1: align = 0; if (hw->mask) align = hw->mask; break; case X86_BREAKPOINT_LEN_2: align = 1; break; case X86_BREAKPOINT_LEN_4: align = 3; break; #ifdef CONFIG_X86_64 case X86_BREAKPOINT_LEN_8: align = 7; break; #endif default: WARN_ON_ONCE(1); return -EINVAL; } /* * Check that the low-order bits of the address are appropriate * for the alignment implied by len. */ if (hw->address & align) return -EINVAL; return 0; } /* * Release the user breakpoints used by ptrace */ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { int i; struct thread_struct *t = &tsk->thread; for (i = 0; i < HBP_NUM; i++) { unregister_hw_breakpoint(t->ptrace_bps[i]); t->ptrace_bps[i] = NULL; } t->virtual_dr6 = 0; t->ptrace_dr7 = 0; } void hw_breakpoint_restore(void) { set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0); set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1); set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2); set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3); set_debugreg(DR6_RESERVED, 6); set_debugreg(__this_cpu_read(cpu_dr7), 7); } EXPORT_SYMBOL_GPL(hw_breakpoint_restore); /* * Handle debug exception notifications. * * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below. * * NOTIFY_DONE returned if one of the following conditions is true. * i) When the causative address is from user-space and the exception * is a valid one, i.e. not triggered as a result of lazy debug register * switching * ii) When there are more bits than trap<n> set in DR6 register (such * as BD, BS or BT) indicating that more than one debug condition is * met and requires some more action in do_debug(). * * NOTIFY_STOP returned for all other cases * */ static int hw_breakpoint_handler(struct die_args *args) { int i, rc = NOTIFY_STOP; struct perf_event *bp; unsigned long *dr6_p; unsigned long dr6; bool bpx; /* The DR6 value is pointed by args->err */ dr6_p = (unsigned long *)ERR_PTR(args->err); dr6 = *dr6_p; /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; /* Handle all the breakpoints that were triggered */ for (i = 0; i < HBP_NUM; ++i) { if (likely(!(dr6 & (DR_TRAP0 << i)))) continue; bp = this_cpu_read(bp_per_reg[i]); if (!bp) continue; bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE; /* * TF and data breakpoints are traps and can be merged, however * instruction breakpoints are faults and will be raised * separately. * * However DR6 can indicate both TF and instruction * breakpoints. In that case take TF as that has precedence and * delay the instruction breakpoint for the next exception. */ if (bpx && (dr6 & DR_STEP)) continue; /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling */ (*dr6_p) &= ~(DR_TRAP0 << i); perf_bp_event(bp, args->regs); /* * Set up resume flag to avoid breakpoint recursion when * returning back to origin. */ if (bpx) args->regs->flags |= X86_EFLAGS_RF; } /* * Further processing in do_debug() is needed for a) user-space * breakpoints (to generate signals) and b) when the system has * taken exception due to multiple causes */ if ((current->thread.virtual_dr6 & DR_TRAP_BITS) || (dr6 & (~DR_TRAP_BITS))) rc = NOTIFY_DONE; return rc; } /* * Handle debug exception notifications. */ int hw_breakpoint_exceptions_notify( struct notifier_block *unused, unsigned long val, void *data) { if (val != DIE_DEBUG) return NOTIFY_DONE; return hw_breakpoint_handler(data); } void hw_breakpoint_pmu_read(struct perf_event *bp) { /* TODO */ }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * AEAD: Authenticated Encryption with Associated Data * * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_AEAD_H #define _CRYPTO_AEAD_H #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/crypto.h> #include <linux/slab.h> #include <linux/types.h> /** * DOC: Authenticated Encryption With Associated Data (AEAD) Cipher API * * The AEAD cipher API is used with the ciphers of type CRYPTO_ALG_TYPE_AEAD * (listed as type "aead" in /proc/crypto) * * The most prominent examples for this type of encryption is GCM and CCM. * However, the kernel supports other types of AEAD ciphers which are defined * with the following cipher string: * * authenc(keyed message digest, block cipher) * * For example: authenc(hmac(sha256), cbc(aes)) * * The example code provided for the symmetric key cipher operation applies * here as well. Naturally all *skcipher* symbols must be exchanged the *aead* * pendants discussed in the following. In addition, for the AEAD operation, * the aead_request_set_ad function must be used to set the pointer to the * associated data memory location before performing the encryption or * decryption operation. Another deviation from the asynchronous block cipher * operation is that the caller should explicitly check for -EBADMSG of the * crypto_aead_decrypt. That error indicates an authentication error, i.e. * a breach in the integrity of the message. In essence, that -EBADMSG error * code is the key bonus an AEAD cipher has over "standard" block chaining * modes. * * Memory Structure: * * The source scatterlist must contain the concatenation of * associated data || plaintext or ciphertext. * * The destination scatterlist has the same layout, except that the plaintext * (resp. ciphertext) will grow (resp. shrink) by the authentication tag size * during encryption (resp. decryption). The authentication tag is generated * during the encryption operation and appended to the ciphertext. During * decryption, the authentication tag is consumed along with the ciphertext and * used to verify the integrity of the plaintext and the associated data. * * In-place encryption/decryption is enabled by using the same scatterlist * pointer for both the source and destination. * * Even in the out-of-place case, space must be reserved in the destination for * the associated data, even though it won't be written to. This makes the * in-place and out-of-place cases more consistent. It is permissible for the * "destination" associated data to alias the "source" associated data. * * As with the other scatterlist crypto APIs, zero-length scatterlist elements * are not allowed in the used part of the scatterlist. Thus, if there is no * associated data, the first element must point to the plaintext/ciphertext. * * To meet the needs of IPsec, a special quirk applies to rfc4106, rfc4309, * rfc4543, and rfc7539esp ciphers. For these ciphers, the final 'ivsize' bytes * of the associated data buffer must contain a second copy of the IV. This is * in addition to the copy passed to aead_request_set_crypt(). These two IV * copies must not differ; different implementations of the same algorithm may * behave differently in that case. Note that the algorithm might not actually * treat the IV as associated data; nevertheless the length passed to * aead_request_set_ad() must include it. */ struct crypto_aead; struct scatterlist; /** * struct aead_request - AEAD request * @base: Common attributes for async crypto requests * @assoclen: Length in bytes of associated data for authentication * @cryptlen: Length of data to be encrypted or decrypted * @iv: Initialisation vector * @src: Source data * @dst: Destination data * @__ctx: Start of private context data */ struct aead_request { struct crypto_async_request base; unsigned int assoclen; unsigned int cryptlen; u8 *iv; struct scatterlist *src; struct scatterlist *dst; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; /* * struct crypto_istat_aead - statistics for AEAD algorithm * @encrypt_cnt: number of encrypt requests * @encrypt_tlen: total data size handled by encrypt requests * @decrypt_cnt: number of decrypt requests * @decrypt_tlen: total data size handled by decrypt requests * @err_cnt: number of error for AEAD requests */ struct crypto_istat_aead { atomic64_t encrypt_cnt; atomic64_t encrypt_tlen; atomic64_t decrypt_cnt; atomic64_t decrypt_tlen; atomic64_t err_cnt; }; /** * struct aead_alg - AEAD cipher definition * @maxauthsize: Set the maximum authentication tag size supported by the * transformation. A transformation may support smaller tag sizes. * As the authentication tag is a message digest to ensure the * integrity of the encrypted data, a consumer typically wants the * largest authentication tag possible as defined by this * variable. * @setauthsize: Set authentication size for the AEAD transformation. This * function is used to specify the consumer requested size of the * authentication tag to be either generated by the transformation * during encryption or the size of the authentication tag to be * supplied during the decryption operation. This function is also * responsible for checking the authentication tag size for * validity. * @setkey: see struct skcipher_alg * @encrypt: see struct skcipher_alg * @decrypt: see struct skcipher_alg * @stat: statistics for AEAD algorithm * @ivsize: see struct skcipher_alg * @chunksize: see struct skcipher_alg * @init: Initialize the cryptographic transformation object. This function * is used to initialize the cryptographic transformation object. * This function is called only once at the instantiation time, right * after the transformation context was allocated. In case the * cryptographic hardware has some special requirements which need to * be handled by software, this function shall check for the precise * requirement of the transformation and put any software fallbacks * in place. * @exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @init, used to remove various changes set in * @init. * @base: Definition of a generic crypto cipher algorithm. * * All fields except @ivsize is mandatory and must be filled. */ struct aead_alg { int (*setkey)(struct crypto_aead *tfm, const u8 *key, unsigned int keylen); int (*setauthsize)(struct crypto_aead *tfm, unsigned int authsize); int (*encrypt)(struct aead_request *req); int (*decrypt)(struct aead_request *req); int (*init)(struct crypto_aead *tfm); void (*exit)(struct crypto_aead *tfm); #ifdef CONFIG_CRYPTO_STATS struct crypto_istat_aead stat; #endif unsigned int ivsize; unsigned int maxauthsize; unsigned int chunksize; struct crypto_alg base; }; struct crypto_aead { unsigned int authsize; unsigned int reqsize; struct crypto_tfm base; }; static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_aead, base); } /** * crypto_alloc_aead() - allocate AEAD cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * AEAD cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an AEAD. The returned struct * crypto_aead is the cipher handle that is required for any subsequent * API invocation for that AEAD. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_aead_tfm(struct crypto_aead *tfm) { return &tfm->base; } /** * crypto_free_aead() - zeroize and free aead handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_aead(struct crypto_aead *tfm) { crypto_destroy_tfm(tfm, crypto_aead_tfm(tfm)); } /** * crypto_has_aead() - Search for the availability of an aead. * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * aead * @type: specifies the type of the aead * @mask: specifies the mask for the aead * * Return: true when the aead is known to the kernel crypto API; false * otherwise */ int crypto_has_aead(const char *alg_name, u32 type, u32 mask); static inline const char *crypto_aead_driver_name(struct crypto_aead *tfm) { return crypto_tfm_alg_driver_name(crypto_aead_tfm(tfm)); } static inline struct aead_alg *crypto_aead_alg(struct crypto_aead *tfm) { return container_of(crypto_aead_tfm(tfm)->__crt_alg, struct aead_alg, base); } static inline unsigned int crypto_aead_alg_ivsize(struct aead_alg *alg) { return alg->ivsize; } /** * crypto_aead_ivsize() - obtain IV size * @tfm: cipher handle * * The size of the IV for the aead referenced by the cipher handle is * returned. This IV size may be zero if the cipher does not need an IV. * * Return: IV size in bytes */ static inline unsigned int crypto_aead_ivsize(struct crypto_aead *tfm) { return crypto_aead_alg_ivsize(crypto_aead_alg(tfm)); } /** * crypto_aead_authsize() - obtain maximum authentication data size * @tfm: cipher handle * * The maximum size of the authentication data for the AEAD cipher referenced * by the AEAD cipher handle is returned. The authentication data size may be * zero if the cipher implements a hard-coded maximum. * * The authentication data may also be known as "tag value". * * Return: authentication data size / tag size in bytes */ static inline unsigned int crypto_aead_authsize(struct crypto_aead *tfm) { return tfm->authsize; } static inline unsigned int crypto_aead_alg_maxauthsize(struct aead_alg *alg) { return alg->maxauthsize; } static inline unsigned int crypto_aead_maxauthsize(struct crypto_aead *aead) { return crypto_aead_alg_maxauthsize(crypto_aead_alg(aead)); } /** * crypto_aead_blocksize() - obtain block size of cipher * @tfm: cipher handle * * The block size for the AEAD referenced with the cipher handle is returned. * The caller may use that information to allocate appropriate memory for the * data returned by the encryption or decryption operation * * Return: block size of cipher */ static inline unsigned int crypto_aead_blocksize(struct crypto_aead *tfm) { return crypto_tfm_alg_blocksize(crypto_aead_tfm(tfm)); } static inline unsigned int crypto_aead_alignmask(struct crypto_aead *tfm) { return crypto_tfm_alg_alignmask(crypto_aead_tfm(tfm)); } static inline u32 crypto_aead_get_flags(struct crypto_aead *tfm) { return crypto_tfm_get_flags(crypto_aead_tfm(tfm)); } static inline void crypto_aead_set_flags(struct crypto_aead *tfm, u32 flags) { crypto_tfm_set_flags(crypto_aead_tfm(tfm), flags); } static inline void crypto_aead_clear_flags(struct crypto_aead *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_aead_tfm(tfm), flags); } /** * crypto_aead_setkey() - set key for cipher * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the AEAD referenced by the cipher * handle. * * Note, the key length determines the cipher type. Many block ciphers implement * different cipher modes depending on the key size, such as AES-128 vs AES-192 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 * is performed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_aead_setkey(struct crypto_aead *tfm, const u8 *key, unsigned int keylen); /** * crypto_aead_setauthsize() - set authentication data size * @tfm: cipher handle * @authsize: size of the authentication data / tag in bytes * * Set the authentication data size / tag size. AEAD requires an authentication * tag (or MAC) in addition to the associated data. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize); static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req) { return __crypto_aead_cast(req->base.tfm); } /** * crypto_aead_encrypt() - encrypt plaintext * @req: reference to the aead_request handle that holds all information * needed to perform the cipher operation * * Encrypt plaintext data using the aead_request handle. That data structure * and how it is filled with data is discussed with the aead_request_* * functions. * * IMPORTANT NOTE The encryption operation creates the authentication data / * tag. That data is concatenated with the created ciphertext. * The ciphertext memory size is therefore the given number of * block cipher blocks + the size defined by the * crypto_aead_setauthsize invocation. The caller must ensure * that sufficient memory is available for the ciphertext and * the authentication tag. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_aead_encrypt(struct aead_request *req); /** * crypto_aead_decrypt() - decrypt ciphertext * @req: reference to the aead_request handle that holds all information * needed to perform the cipher operation * * Decrypt ciphertext data using the aead_request handle. That data structure * and how it is filled with data is discussed with the aead_request_* * functions. * * IMPORTANT NOTE The caller must concatenate the ciphertext followed by the * authentication data / tag. That authentication data / tag * must have the size defined by the crypto_aead_setauthsize * invocation. * * * Return: 0 if the cipher operation was successful; -EBADMSG: The AEAD * cipher operation performs the authentication of the data during the * decryption operation. Therefore, the function returns this error if * the authentication of the ciphertext was unsuccessful (i.e. the * integrity of the ciphertext or the associated data was violated); * < 0 if an error occurred. */ int crypto_aead_decrypt(struct aead_request *req); /** * DOC: Asynchronous AEAD Request Handle * * The aead_request data structure contains all pointers to data required for * the AEAD cipher operation. This includes the cipher handle (which can be * used by multiple aead_request instances), pointer to plaintext and * ciphertext, asynchronous callback function, etc. It acts as a handle to the * aead_request_* API calls in a similar way as AEAD handle to the * crypto_aead_* API calls. */ /** * crypto_aead_reqsize() - obtain size of the request data structure * @tfm: cipher handle * * Return: number of bytes */ static inline unsigned int crypto_aead_reqsize(struct crypto_aead *tfm) { return tfm->reqsize; } /** * aead_request_set_tfm() - update cipher handle reference in request * @req: request handle to be modified * @tfm: cipher handle that shall be added to the request handle * * Allow the caller to replace the existing aead handle in the request * data structure with a different one. */ static inline void aead_request_set_tfm(struct aead_request *req, struct crypto_aead *tfm) { req->base.tfm = crypto_aead_tfm(tfm); } /** * aead_request_alloc() - allocate request data structure * @tfm: cipher handle to be registered with the request * @gfp: memory allocation flag that is handed to kmalloc by the API call. * * Allocate the request data structure that must be used with the AEAD * encrypt and decrypt API calls. During the allocation, the provided aead * handle is registered in the request data structure. * * Return: allocated request handle in case of success, or NULL if out of memory */ static inline struct aead_request *aead_request_alloc(struct crypto_aead *tfm, gfp_t gfp) { struct aead_request *req; req = kmalloc(sizeof(*req) + crypto_aead_reqsize(tfm), gfp); if (likely(req)) aead_request_set_tfm(req, tfm); return req; } /** * aead_request_free() - zeroize and free request data structure * @req: request data structure cipher handle to be freed */ static inline void aead_request_free(struct aead_request *req) { kfree_sensitive(req); } /** * aead_request_set_callback() - set asynchronous callback function * @req: request handle * @flags: specify zero or an ORing of the flags * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and * increase the wait queue beyond the initial maximum size; * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep * @compl: callback function pointer to be registered with the request handle * @data: The data pointer refers to memory that is not used by the kernel * crypto API, but provided to the callback function for it to use. Here, * the caller can provide a reference to memory the callback function can * operate on. As the callback function is invoked asynchronously to the * related functionality, it may need to access data structures of the * related functionality which can be referenced using this pointer. The * callback function can access the memory via the "data" field in the * crypto_async_request data structure provided to the callback function. * * Setting the callback function that is triggered once the cipher operation * completes * * The callback function is registered with the aead_request handle and * must comply with the following template:: * * void callback_function(struct crypto_async_request *req, int error) */ static inline void aead_request_set_callback(struct aead_request *req, u32 flags, crypto_completion_t compl, void *data) { req->base.complete = compl; req->base.data = data; req->base.flags = flags; } /** * aead_request_set_crypt - set data buffers * @req: request handle * @src: source scatter / gather list * @dst: destination scatter / gather list * @cryptlen: number of bytes to process from @src * @iv: IV for the cipher operation which must comply with the IV size defined * by crypto_aead_ivsize() * * Setting the source data and destination data scatter / gather lists which * hold the associated data concatenated with the plaintext or ciphertext. See * below for the authentication tag. * * For encryption, the source is treated as the plaintext and the * destination is the ciphertext. For a decryption operation, the use is * reversed - the source is the ciphertext and the destination is the plaintext. * * The memory structure for cipher operation has the following structure: * * - AEAD encryption input: assoc data || plaintext * - AEAD encryption output: assoc data || ciphertext || auth tag * - AEAD decryption input: assoc data || ciphertext || auth tag * - AEAD decryption output: assoc data || plaintext * * Albeit the kernel requires the presence of the AAD buffer, however, * the kernel does not fill the AAD buffer in the output case. If the * caller wants to have that data buffer filled, the caller must either * use an in-place cipher operation (i.e. same memory location for * input/output memory location). */ static inline void aead_request_set_crypt(struct aead_request *req, struct scatterlist *src, struct scatterlist *dst, unsigned int cryptlen, u8 *iv) { req->src = src; req->dst = dst; req->cryptlen = cryptlen; req->iv = iv; } /** * aead_request_set_ad - set associated data information * @req: request handle * @assoclen: number of bytes in associated data * * Setting the AD information. This function sets the length of * the associated data. */ static inline void aead_request_set_ad(struct aead_request *req, unsigned int assoclen) { req->assoclen = assoclen; } #endif /* _CRYPTO_AEAD_H */
798 799 798 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_JUMP_LABEL_H #define _LINUX_JUMP_LABEL_H /* * Jump label support * * Copyright (C) 2009-2012 Jason Baron <jbaron@redhat.com> * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra * * DEPRECATED API: * * The use of 'struct static_key' directly, is now DEPRECATED. In addition * static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following: * * struct static_key false = STATIC_KEY_INIT_FALSE; * struct static_key true = STATIC_KEY_INIT_TRUE; * static_key_true() * static_key_false() * * The updated API replacements are: * * DEFINE_STATIC_KEY_TRUE(key); * DEFINE_STATIC_KEY_FALSE(key); * DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count); * DEFINE_STATIC_KEY_ARRAY_FALSE(keys, count); * static_branch_likely() * static_branch_unlikely() * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support, if we * define a "key" that is initially false via "DEFINE_STATIC_KEY_FALSE(key)", * an "if (static_branch_unlikely(&key))" statement is an unconditional branch * (which defaults to false - and the true block is placed out of line). * Similarly, we can define an initially true key via * "DEFINE_STATIC_KEY_TRUE(key)", and use it in the same * "if (static_branch_unlikely(&key))", in which case we will generate an * unconditional branch to the out-of-line true branch. Keys that are * initially true or false can be using in both static_branch_unlikely() * and static_branch_likely() statements. * * At runtime we can change the branch target by setting the key * to true via a call to static_branch_enable(), or false using * static_branch_disable(). If the direction of the branch is switched by * these calls then we run-time modify the branch target via a * no-op -> jump or jump -> no-op conversion. For example, for an * initially false key that is used in an "if (static_branch_unlikely(&key))" * statement, setting the key to true requires us to patch in a jump * to the out-of-line of true branch. * * In addition to static_branch_{enable,disable}, we can also reference count * the key or branch direction via static_branch_{inc,dec}. Thus, * static_branch_inc() can be thought of as a 'make more true' and * static_branch_dec() as a 'make more false'. * * Since this relies on modifying code, the branch modifying functions * must be considered absolute slow paths (machine wide synchronization etc.). * OTOH, since the affected branches are unconditional, their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total * effect is a single NOP of appropriate size. The on case will patch in a jump * to the out-of-line block. * * When the control is directly exposed to userspace, it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) * cause significant performance degradation. Struct static_key_deferred and * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, static keys fall back to a * simple conditional branch. * * Additional babbling in: Documentation/staging/static-keys.rst */ #ifndef __ASSEMBLY__ #include <linux/types.h> #include <linux/compiler.h> extern bool static_key_initialized; #define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized, \ "%s(): static key '%pS' used before call to jump_label_init()", \ __func__, (key)) struct static_key { atomic_t enabled; #ifdef CONFIG_JUMP_LABEL /* * Note: * To make anonymous unions work with old compilers, the static * initialization of them requires brackets. This creates a dependency * on the order of the struct with the initializers. If any fields * are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need * to be modified. * * bit 0 => 1 if key is initially true * 0 if initially false * bit 1 => 1 if points to struct static_key_mod * 0 if points to struct jump_entry */ union { unsigned long type; struct jump_entry *entries; struct static_key_mod *next; }; #endif /* CONFIG_JUMP_LABEL */ }; #endif /* __ASSEMBLY__ */ #ifdef CONFIG_JUMP_LABEL #include <asm/jump_label.h> #ifndef __ASSEMBLY__ #ifdef CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE struct jump_entry { s32 code; s32 target; long key; // key may be far away from the core kernel under KASLR }; static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return (unsigned long)&entry->code + entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return (unsigned long)&entry->target + entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { long offset = entry->key & ~3L; return (struct static_key *)((unsigned long)&entry->key + offset); } #else static inline unsigned long jump_entry_code(const struct jump_entry *entry) { return entry->code; } static inline unsigned long jump_entry_target(const struct jump_entry *entry) { return entry->target; } static inline struct static_key *jump_entry_key(const struct jump_entry *entry) { return (struct static_key *)((unsigned long)entry->key & ~3UL); } #endif static inline bool jump_entry_is_branch(const struct jump_entry *entry) { return (unsigned long)entry->key & 1UL; } static inline bool jump_entry_is_init(const struct jump_entry *entry) { return (unsigned long)entry->key & 2UL; } static inline void jump_entry_set_init(struct jump_entry *entry, bool set) { if (set) entry->key |= 2; else entry->key &= ~2; } static inline int jump_entry_size(struct jump_entry *entry) { #ifdef JUMP_LABEL_NOP_SIZE return JUMP_LABEL_NOP_SIZE; #else return arch_jump_entry_size(entry); #endif } #endif #endif #ifndef __ASSEMBLY__ enum jump_label_type { JUMP_LABEL_NOP = 0, JUMP_LABEL_JMP, }; struct module; #ifdef CONFIG_JUMP_LABEL #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL #define JUMP_TYPE_LINKED 2UL #define JUMP_TYPE_MASK 3UL static __always_inline bool static_key_false(struct static_key *key) { return arch_static_branch(key, false); } static __always_inline bool static_key_true(struct static_key *key) { return !arch_static_branch(key, true); } extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type); extern bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_transform_apply(void); extern int jump_label_text_reserved(void *start, void *end); extern bool static_key_slow_inc(struct static_key *key); extern bool static_key_fast_inc_not_disabled(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); extern bool static_key_slow_inc_cpuslocked(struct static_key *key); extern void static_key_slow_dec_cpuslocked(struct static_key *key); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); extern void static_key_disable(struct static_key *key); extern void static_key_enable_cpuslocked(struct static_key *key); extern void static_key_disable_cpuslocked(struct static_key *key); extern enum jump_label_type jump_label_init_type(struct jump_entry *entry); /* * We should be using ATOMIC_INIT() for initializing .enabled, but * the inclusion of atomic.h is problematic for inclusion of jump_label.h * in 'low-level' headers. Thus, we are initializing .enabled with a * raw value, but have added a BUILD_BUG_ON() to catch any issues in * jump_label_init() see: kernel/jump_label.c. */ #define STATIC_KEY_INIT_TRUE \ { .enabled = { 1 }, \ { .type = JUMP_TYPE_TRUE } } #define STATIC_KEY_INIT_FALSE \ { .enabled = { 0 }, \ { .type = JUMP_TYPE_FALSE } } #else /* !CONFIG_JUMP_LABEL */ #include <linux/atomic.h> #include <linux/bug.h> static __always_inline int static_key_count(struct static_key *key) { return raw_atomic_read(&key->enabled); } static __always_inline void jump_label_init(void) { static_key_initialized = true; } static __always_inline bool static_key_false(struct static_key *key) { if (unlikely_notrace(static_key_count(key) > 0)) return true; return false; } static __always_inline bool static_key_true(struct static_key *key) { if (likely_notrace(static_key_count(key) > 0)) return true; return false; } static inline bool static_key_fast_inc_not_disabled(struct static_key *key) { int v; STATIC_KEY_CHECK_USE(key); /* * Prevent key->enabled getting negative to follow the same semantics * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment. */ v = atomic_read(&key->enabled); do { if (v < 0 || (v + 1) < 0) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); return true; } #define static_key_slow_inc(key) static_key_fast_inc_not_disabled(key) static inline void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); atomic_dec(&key->enabled); } #define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key) #define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key) static inline int jump_label_text_reserved(void *start, void *end) { return 0; } static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} static inline void static_key_enable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } atomic_set(&key->enabled, 1); } static inline void static_key_disable(struct static_key *key) { STATIC_KEY_CHECK_USE(key); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } atomic_set(&key->enabled, 0); } #define static_key_enable_cpuslocked(k) static_key_enable((k)) #define static_key_disable_cpuslocked(k) static_key_disable((k)) #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } #endif /* CONFIG_JUMP_LABEL */ #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled /* -------------------------------------------------------------------------- */ /* * Two type wrappers around static_key, such that we can use compile time * type differentiation to emit the right code. * * All the below code is macros in order to play type games. */ struct static_key_true { struct static_key key; }; struct static_key_false { struct static_key key; }; #define STATIC_KEY_TRUE_INIT (struct static_key_true) { .key = STATIC_KEY_INIT_TRUE, } #define STATIC_KEY_FALSE_INIT (struct static_key_false){ .key = STATIC_KEY_INIT_FALSE, } #define DEFINE_STATIC_KEY_TRUE(name) \ struct static_key_true name = STATIC_KEY_TRUE_INIT #define DEFINE_STATIC_KEY_TRUE_RO(name) \ struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT #define DECLARE_STATIC_KEY_TRUE(name) \ extern struct static_key_true name #define DEFINE_STATIC_KEY_FALSE(name) \ struct static_key_false name = STATIC_KEY_FALSE_INIT #define DEFINE_STATIC_KEY_FALSE_RO(name) \ struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT #define DECLARE_STATIC_KEY_FALSE(name) \ extern struct static_key_false name #define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count) \ struct static_key_true name[count] = { \ [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT, \ } #define DEFINE_STATIC_KEY_ARRAY_FALSE(name, count) \ struct static_key_false name[count] = { \ [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT, \ } #define _DEFINE_STATIC_KEY_1(name) DEFINE_STATIC_KEY_TRUE(name) #define _DEFINE_STATIC_KEY_0(name) DEFINE_STATIC_KEY_FALSE(name) #define DEFINE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name) #define _DEFINE_STATIC_KEY_RO_1(name) DEFINE_STATIC_KEY_TRUE_RO(name) #define _DEFINE_STATIC_KEY_RO_0(name) DEFINE_STATIC_KEY_FALSE_RO(name) #define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name) \ __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name) #define _DECLARE_STATIC_KEY_1(name) DECLARE_STATIC_KEY_TRUE(name) #define _DECLARE_STATIC_KEY_0(name) DECLARE_STATIC_KEY_FALSE(name) #define DECLARE_STATIC_KEY_MAYBE(cfg, name) \ __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name) extern bool ____wrong_branch_error(void); #define static_key_enabled(x) \ ({ \ if (!__builtin_types_compatible_p(typeof(*x), struct static_key) && \ !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\ !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ ____wrong_branch_error(); \ static_key_count((struct static_key *)x) > 0; \ }) #ifdef CONFIG_JUMP_LABEL /* * Combine the right initial value (type) with the right branch order * to generate the desired result. * * * type\branch| likely (1) | unlikely (0) * -----------+-----------------------+------------------ * | | * true (1) | ... | ... * | NOP | JMP L * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * | | * false (0) | ... | ... * | JMP L | NOP * | <br-stmts> | 1: ... * | L: ... | * | | * | | L: <br-stmts> * | | jmp 1b * | | * -----------+-----------------------+------------------ * * The initial value is encoded in the LSB of static_key::entries, * type: 0 = false, 1 = true. * * The branch type is encoded in the LSB of jump_entry::key, * branch: 0 = unlikely, 1 = likely. * * This gives the following logic table: * * enabled type branch instuction * -----------------------------+----------- * 0 0 0 | NOP * 0 0 1 | JMP * 0 1 0 | NOP * 0 1 1 | JMP * * 1 0 0 | JMP * 1 0 1 | NOP * 1 1 0 | JMP * 1 1 1 | NOP * * Which gives the following functions: * * dynamic: instruction = enabled ^ branch * static: instruction = type ^ branch * * See jump_label_type() / jump_label_init_type(). */ #define static_branch_likely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = !arch_static_branch(&(x)->key, true); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = !arch_static_branch_jump(&(x)->key, true); \ else \ branch = ____wrong_branch_error(); \ likely_notrace(branch); \ }) #define static_branch_unlikely(x) \ ({ \ bool branch; \ if (__builtin_types_compatible_p(typeof(*x), struct static_key_true)) \ branch = arch_static_branch_jump(&(x)->key, false); \ else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \ branch = arch_static_branch(&(x)->key, false); \ else \ branch = ____wrong_branch_error(); \ unlikely_notrace(branch); \ }) #else /* !CONFIG_JUMP_LABEL */ #define static_branch_likely(x) likely_notrace(static_key_enabled(&(x)->key)) #define static_branch_unlikely(x) unlikely_notrace(static_key_enabled(&(x)->key)) #endif /* CONFIG_JUMP_LABEL */ #define static_branch_maybe(config, x) \ (IS_ENABLED(config) ? static_branch_likely(x) \ : static_branch_unlikely(x)) /* * Advanced usage; refcount, branch is enabled when: count != 0 */ #define static_branch_inc(x) static_key_slow_inc(&(x)->key) #define static_branch_dec(x) static_key_slow_dec(&(x)->key) #define static_branch_inc_cpuslocked(x) static_key_slow_inc_cpuslocked(&(x)->key) #define static_branch_dec_cpuslocked(x) static_key_slow_dec_cpuslocked(&(x)->key) /* * Normal usage; boolean enable/disable. */ #define static_branch_enable(x) static_key_enable(&(x)->key) #define static_branch_disable(x) static_key_disable(&(x)->key) #define static_branch_enable_cpuslocked(x) static_key_enable_cpuslocked(&(x)->key) #define static_branch_disable_cpuslocked(x) static_key_disable_cpuslocked(&(x)->key) #endif /* __ASSEMBLY__ */ #endif /* _LINUX_JUMP_LABEL_H */
3039 3039 3038 3037 3038 3037 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 // SPDX-License-Identifier: GPL-2.0-or-later /* * The "hash function" used as the core of the ChaCha stream cipher (RFC7539) * * Copyright (C) 2015 Martin Willi */ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/bitops.h> #include <linux/string.h> #include <asm/unaligned.h> #include <crypto/chacha.h> static void chacha_permute(u32 *x, int nrounds) { int i; /* whitelist the allowed round counts */ WARN_ON_ONCE(nrounds != 20 && nrounds != 12); for (i = 0; i < nrounds; i += 2) { x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16); x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16); x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16); x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16); x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12); x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12); x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12); x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12); x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8); x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8); x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8); x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8); x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7); x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7); x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7); x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7); x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16); x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16); x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16); x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16); x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12); x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12); x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12); x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12); x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8); x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8); x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8); x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8); x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7); x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7); x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7); x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7); } } /** * chacha_block_generic - generate one keystream block and increment block counter * @state: input state matrix (16 32-bit words) * @stream: output keystream block (64 bytes) * @nrounds: number of rounds (20 or 12; 20 is recommended) * * This is the ChaCha core, a function from 64-byte strings to 64-byte strings. * The caller has already converted the endianness of the input. This function * also handles incrementing the block counter in the input matrix. */ void chacha_block_generic(u32 *state, u8 *stream, int nrounds) { u32 x[16]; int i; memcpy(x, state, 64); chacha_permute(x, nrounds); for (i = 0; i < ARRAY_SIZE(x); i++) put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]); state[12]++; } EXPORT_SYMBOL(chacha_block_generic); /** * hchacha_block_generic - abbreviated ChaCha core, for XChaCha * @state: input state matrix (16 32-bit words) * @stream: output (8 32-bit words) * @nrounds: number of rounds (20 or 12; 20 is recommended) * * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha * skips the final addition of the initial state, and outputs only certain words * of the state. It should not be used for streaming directly. */ void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds) { u32 x[16]; memcpy(x, state, 64); chacha_permute(x, nrounds); memcpy(&stream[0], &x[0], 16); memcpy(&stream[4], &x[12], 16); } EXPORT_SYMBOL(hchacha_block_generic);
1091 1326 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MATH64_H #define _LINUX_MATH64_H #include <linux/types.h> #include <linux/math.h> #include <vdso/math64.h> #include <asm/div64.h> #if BITS_PER_LONG == 64 #define div64_long(x, y) div64_s64((x), (y)) #define div64_ul(x, y) div64_u64((x), (y)) /** * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * @remainder: pointer to unsigned 32bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor * * This is commonly provided by 32bit archs to provide an optimized 64bit * divide. */ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /** * div_s64_rem - signed 64bit divide with 32bit divisor with remainder * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * @remainder: pointer to signed 32bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor */ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /** * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * @remainder: pointer to unsigned 64bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor */ static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /** * div64_u64 - unsigned 64bit divide with 64bit divisor * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * * Return: dividend / divisor */ static inline u64 div64_u64(u64 dividend, u64 divisor) { return dividend / divisor; } /** * div64_s64 - signed 64bit divide with 64bit divisor * @dividend: signed 64bit dividend * @divisor: signed 64bit divisor * * Return: dividend / divisor */ static inline s64 div64_s64(s64 dividend, s64 divisor) { return dividend / divisor; } #elif BITS_PER_LONG == 32 #define div64_long(x, y) div_s64((x), (y)) #define div64_ul(x, y) div_u64((x), (y)) #ifndef div_u64_rem static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) { *remainder = do_div(dividend, divisor); return dividend; } #endif #ifndef div_s64_rem extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); #endif #ifndef div64_u64_rem extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder); #endif #ifndef div64_u64 extern u64 div64_u64(u64 dividend, u64 divisor); #endif #ifndef div64_s64 extern s64 div64_s64(s64 dividend, s64 divisor); #endif #endif /* BITS_PER_LONG */ /** * div_u64 - unsigned 64bit divide with 32bit divisor * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * * This is the most common 64bit divide and should be used if possible, * as many 32bit archs can optimize this variant better than a full 64bit * divide. * * Return: dividend / divisor */ #ifndef div_u64 static inline u64 div_u64(u64 dividend, u32 divisor) { u32 remainder; return div_u64_rem(dividend, divisor, &remainder); } #endif /** * div_s64 - signed 64bit divide with 32bit divisor * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * * Return: dividend / divisor */ #ifndef div_s64 static inline s64 div_s64(s64 dividend, s32 divisor) { s32 remainder; return div_s64_rem(dividend, divisor, &remainder); } #endif u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder); #ifndef mul_u32_u32 /* * Many a GCC version messes this up and generates a 64x64 mult :-( */ static inline u64 mul_u32_u32(u32 a, u32 b) { return (u64)a * b; } #endif #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_shr static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } #endif /* mul_u64_u32_shr */ #ifndef mul_u64_u64_shr static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } #endif /* mul_u64_u64_shr */ #else #ifndef mul_u64_u32_shr static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { u32 ah, al; u64 ret; al = a; ah = a >> 32; ret = mul_u32_u32(al, mul) >> shift; if (ah) ret += mul_u32_u32(ah, mul) << (32 - shift); return ret; } #endif /* mul_u64_u32_shr */ #ifndef mul_u64_u64_shr static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift) { union { u64 ll; struct { #ifdef __BIG_ENDIAN u32 high, low; #else u32 low, high; #endif } l; } rl, rm, rn, rh, a0, b0; u64 c; a0.ll = a; b0.ll = b; rl.ll = mul_u32_u32(a0.l.low, b0.l.low); rm.ll = mul_u32_u32(a0.l.low, b0.l.high); rn.ll = mul_u32_u32(a0.l.high, b0.l.low); rh.ll = mul_u32_u32(a0.l.high, b0.l.high); /* * Each of these lines computes a 64-bit intermediate result into "c", * starting at bits 32-95. The low 32-bits go into the result of the * multiplication, the high 32-bits are carried into the next step. */ rl.l.high = c = (u64)rl.l.high + rm.l.low + rn.l.low; rh.l.low = c = (c >> 32) + rm.l.high + rn.l.high + rh.l.low; rh.l.high = (c >> 32) + rh.l.high; /* * The 128-bit result of the multiplication is in rl.ll and rh.ll, * shift it right and throw away the high part of the result. */ if (shift == 0) return rl.ll; if (shift < 64) return (rl.ll >> shift) | (rh.ll << (64 - shift)); return rh.ll >> (shift & 63); } #endif /* mul_u64_u64_shr */ #endif #ifndef mul_s64_u64_shr static inline u64 mul_s64_u64_shr(s64 a, u64 b, unsigned int shift) { u64 ret; /* * Extract the sign before the multiplication and put it back * afterwards if needed. */ ret = mul_u64_u64_shr(abs(a), b, shift); if (a < 0) ret = -((s64) ret); return ret; } #endif /* mul_s64_u64_shr */ #ifndef mul_u64_u32_div static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) { union { u64 ll; struct { #ifdef __BIG_ENDIAN u32 high, low; #else u32 low, high; #endif } l; } u, rl, rh; u.ll = a; rl.ll = mul_u32_u32(u.l.low, mul); rh.ll = mul_u32_u32(u.l.high, mul) + rl.l.high; /* Bits 32-63 of the result will be in rh.l.low. */ rl.l.high = do_div(rh.ll, divisor); /* Bits 0-31 of the result will be in rl.l.low. */ do_div(rl.ll, divisor); rl.l.high = rh.l.low; return rl.ll; } #endif /* mul_u64_u32_div */ u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); /** * DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up * @ll: unsigned 64bit dividend * @d: unsigned 64bit divisor * * Divide unsigned 64bit dividend by unsigned 64bit divisor * and round up. * * Return: dividend / divisor rounded up */ #define DIV64_U64_ROUND_UP(ll, d) \ ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) /** * DIV64_U64_ROUND_CLOSEST - unsigned 64bit divide with 64bit divisor rounded to nearest integer * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * * Divide unsigned 64bit dividend by unsigned 64bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV64_U64_ROUND_CLOSEST(dividend, divisor) \ ({ u64 _tmp = (divisor); div64_u64((dividend) + _tmp / 2, _tmp); }) /** * DIV_U64_ROUND_CLOSEST - unsigned 64bit divide with 32bit divisor rounded to nearest integer * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * * Divide unsigned 64bit dividend by unsigned 32bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV_U64_ROUND_CLOSEST(dividend, divisor) \ ({ u32 _tmp = (divisor); div_u64((u64)(dividend) + _tmp / 2, _tmp); }) /** * DIV_S64_ROUND_CLOSEST - signed 64bit divide with 32bit divisor rounded to nearest integer * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * * Divide signed 64bit dividend by signed 32bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV_S64_ROUND_CLOSEST(dividend, divisor)( \ { \ s64 __x = (dividend); \ s32 __d = (divisor); \ ((__x > 0) == (__d > 0)) ? \ div_s64((__x + (__d / 2)), __d) : \ div_s64((__x - (__d / 2)), __d); \ } \ ) #endif /* _LINUX_MATH64_H */
265 194 1952 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BITOPS_H #define _LINUX_BITOPS_H #include <asm/types.h> #include <linux/bits.h> #include <linux/typecheck.h> #include <uapi/linux/kernel.h> /* Set bits in the first 'n' bytes when loaded from memory */ #ifdef __LITTLE_ENDIAN # define aligned_byte_mask(n) ((1UL << 8*(n))-1) #else # define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n))) #endif #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BITS_TO_LONGS(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) #define BITS_TO_U64(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64)) #define BITS_TO_U32(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); extern unsigned long __sw_hweight64(__u64 w); /* * Defined here because those may be needed by architecture-specific static * inlines. */ #include <asm-generic/bitops/generic-non-atomic.h> /* * Many architecture-specific non-atomic bitops contain inline asm code and due * to that the compiler can't optimize them to compile-time expressions or * constants. In contrary, generic_*() helpers are defined in pure C and * compilers optimize them just well. * Therefore, to make `unsigned long foo = 0; __set_bit(BAR, &foo)` effectively * equal to `unsigned long foo = BIT(BAR)`, pick the generic C alternative when * the arguments can be resolved at compile time. That expression itself is a * constant and doesn't bring any functional changes to the rest of cases. * The casts to `uintptr_t` are needed to mitigate `-Waddress` warnings when * passing a bitmap from .bss or .data (-> `!!addr` is always true). */ #define bitop(op, nr, addr) \ ((__builtin_constant_p(nr) && \ __builtin_constant_p((uintptr_t)(addr) != (uintptr_t)NULL) && \ (uintptr_t)(addr) != (uintptr_t)NULL && \ __builtin_constant_p(*(const unsigned long *)(addr))) ? \ const##op(nr, addr) : op(nr, addr)) #define __set_bit(nr, addr) bitop(___set_bit, nr, addr) #define __clear_bit(nr, addr) bitop(___clear_bit, nr, addr) #define __change_bit(nr, addr) bitop(___change_bit, nr, addr) #define __test_and_set_bit(nr, addr) bitop(___test_and_set_bit, nr, addr) #define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) #define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) #define test_bit(nr, addr) bitop(_test_bit, nr, addr) #define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr) /* * Include this here because some architectures need generic_ffs/fls in * scope */ #include <asm/bitops.h> /* Check that the bitops prototypes are sane */ #define __check_bitop_pr(name) \ static_assert(__same_type(arch_##name, generic_##name) && \ __same_type(const_##name, generic_##name) && \ __same_type(_##name, generic_##name)) __check_bitop_pr(__set_bit); __check_bitop_pr(__clear_bit); __check_bitop_pr(__change_bit); __check_bitop_pr(__test_and_set_bit); __check_bitop_pr(__test_and_clear_bit); __check_bitop_pr(__test_and_change_bit); __check_bitop_pr(test_bit); #undef __check_bitop_pr static inline int get_bitmask_order(unsigned int count) { int order; order = fls(count); return order; /* We could be slightly more clever with -1 here... */ } static __always_inline unsigned long hweight_long(unsigned long w) { return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w); } /** * rol64 - rotate a 64-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u64 rol64(__u64 word, unsigned int shift) { return (word << (shift & 63)) | (word >> ((-shift) & 63)); } /** * ror64 - rotate a 64-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u64 ror64(__u64 word, unsigned int shift) { return (word >> (shift & 63)) | (word << ((-shift) & 63)); } /** * rol32 - rotate a 32-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u32 rol32(__u32 word, unsigned int shift) { return (word << (shift & 31)) | (word >> ((-shift) & 31)); } /** * ror32 - rotate a 32-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u32 ror32(__u32 word, unsigned int shift) { return (word >> (shift & 31)) | (word << ((-shift) & 31)); } /** * rol16 - rotate a 16-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u16 rol16(__u16 word, unsigned int shift) { return (word << (shift & 15)) | (word >> ((-shift) & 15)); } /** * ror16 - rotate a 16-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u16 ror16(__u16 word, unsigned int shift) { return (word >> (shift & 15)) | (word << ((-shift) & 15)); } /** * rol8 - rotate an 8-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u8 rol8(__u8 word, unsigned int shift) { return (word << (shift & 7)) | (word >> ((-shift) & 7)); } /** * ror8 - rotate an 8-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u8 ror8(__u8 word, unsigned int shift) { return (word >> (shift & 7)) | (word << ((-shift) & 7)); } /** * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0<=index<32) to sign bit * * This is safe to use for 16- and 8-bit types as well. */ static __always_inline __s32 sign_extend32(__u32 value, int index) { __u8 shift = 31 - index; return (__s32)(value << shift) >> shift; } /** * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0<=index<64) to sign bit */ static __always_inline __s64 sign_extend64(__u64 value, int index) { __u8 shift = 63 - index; return (__s64)(value << shift) >> shift; } static inline unsigned fls_long(unsigned long l) { if (sizeof(l) == 4) return fls(l); return fls64(l); } static inline int get_count_order(unsigned int count) { if (count == 0) return -1; return fls(--count); } /** * get_count_order_long - get order after rounding @l up to power of 2 * @l: parameter * * it is same as get_count_order() but with long type parameter */ static inline int get_count_order_long(unsigned long l) { if (l == 0UL) return -1; return (int)fls_long(--l); } /** * __ffs64 - find first set bit in a 64 bit word * @word: The 64 bit word * * On 64 bit arches this is a synonym for __ffs * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. */ static inline unsigned long __ffs64(u64 word) { #if BITS_PER_LONG == 32 if (((u32)word) == 0UL) return __ffs((u32)(word >> 32)) + 32; #elif BITS_PER_LONG != 64 #error BITS_PER_LONG not 32 or 64 #endif return __ffs((unsigned long)word); } /** * fns - find N'th set bit in a word * @word: The word to search * @n: Bit to find */ static inline unsigned long fns(unsigned long word, unsigned int n) { unsigned int bit; while (word) { bit = __ffs(word); if (n-- == 0) return bit; __clear_bit(bit, &word); } return BITS_PER_LONG; } /** * assign_bit - Assign value to a bit in memory * @nr: the bit to set * @addr: the address to start counting from * @value: the value to assign */ static __always_inline void assign_bit(long nr, volatile unsigned long *addr, bool value) { if (value) set_bit(nr, addr); else clear_bit(nr, addr); } static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, bool value) { if (value) __set_bit(nr, addr); else __clear_bit(nr, addr); } /** * __ptr_set_bit - Set bit in a pointer's value * @nr: the bit to set * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_set_bit(bit, &p); */ #define __ptr_set_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __set_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_clear_bit - Clear bit in a pointer's value * @nr: the bit to clear * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_clear_bit(bit, &p); */ #define __ptr_clear_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __clear_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_test_bit - Test bit in a pointer's value * @nr: the bit to test * @addr: the address of the pointer variable * * Example: * void *p = foo(); * if (__ptr_test_bit(bit, &p)) { * ... * } else { * ... * } */ #define __ptr_test_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ test_bit(nr, (unsigned long *)(addr)); \ }) #ifdef __KERNEL__ #ifndef set_mask_bits #define set_mask_bits(ptr, mask, bits) \ ({ \ const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ new__ = (old__ & ~mask__) | bits__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ old__; \ }) #endif #ifndef bit_clear_unless #define bit_clear_unless(ptr, clear, test) \ ({ \ const typeof(*(ptr)) clear__ = (clear), test__ = (test);\ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ if (old__ & test__) \ break; \ new__ = old__ & ~clear__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ !(old__ & test__); \ }) #endif #endif /* __KERNEL__ */ #endif
82 82 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 // SPDX-License-Identifier: GPL-2.0 /* * Devices PM QoS constraints management * * Copyright (C) 2011 Texas Instruments, Inc. * * This module exposes the interface to kernel space for specifying * per-device PM QoS dependencies. It provides infrastructure for registration * of: * * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. * Watchers register to keep track of the current QoS needs of the system. * Watchers can register a per-device notification callback using the * dev_pm_qos_*_notifier API. The notification chain data is stored in the * per-device constraint data struct. * * Note about the per-device constraint data struct allocation: * . The per-device constraints data struct ptr is stored into the device * dev_pm_info. * . To minimize the data usage by the per-device constraints, the data struct * is only allocated at the first call to dev_pm_qos_add_request. * . The data is later free'd when the device is removed from the system. * . A global mutex protects the constraints users from the data being * allocated and free'd. */ #include <linux/pm_qos.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/mutex.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/err.h> #include <trace/events/power.h> #include "power.h" static DEFINE_MUTEX(dev_pm_qos_mtx); static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx); /** * __dev_pm_qos_flags - Check PM QoS flags for a given device. * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. * * This routine must be called with dev->power.lock held. */ enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { struct dev_pm_qos *qos = dev->power.qos; struct pm_qos_flags *pqf; s32 val; lockdep_assert_held(&dev->power.lock); if (IS_ERR_OR_NULL(qos)) return PM_QOS_FLAGS_UNDEFINED; pqf = &qos->flags; if (list_empty(&pqf->list)) return PM_QOS_FLAGS_UNDEFINED; val = pqf->effective_flags & mask; if (val) return (val == mask) ? PM_QOS_FLAGS_ALL : PM_QOS_FLAGS_SOME; return PM_QOS_FLAGS_NONE; } /** * dev_pm_qos_flags - Check PM QoS flags for a given device (locked). * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. */ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { unsigned long irqflags; enum pm_qos_flags_status ret; spin_lock_irqsave(&dev->power.lock, irqflags); ret = __dev_pm_qos_flags(dev, mask); spin_unlock_irqrestore(&dev->power.lock, irqflags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_flags); /** * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device. * @dev: Device to get the PM QoS constraint value for. * * This routine must be called with dev->power.lock held. */ s32 __dev_pm_qos_resume_latency(struct device *dev) { lockdep_assert_held(&dev->power.lock); return dev_pm_qos_raw_resume_latency(dev); } /** * dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked). * @dev: Device to get the PM QoS constraint value for. * @type: QoS request type. */ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos *qos = dev->power.qos; unsigned long flags; s32 ret; spin_lock_irqsave(&dev->power.lock, flags); switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&qos->resume_latency); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MIN); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MAX); break; default: WARN_ON(1); ret = 0; } spin_unlock_irqrestore(&dev->power.lock, flags); return ret; } /** * apply_constraint - Add/modify/remove device PM QoS request. * @req: Constraint request to apply * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * Internal function to update the constraints list using the PM QoS core * code and if needed call the per-device callbacks. */ static int apply_constraint(struct dev_pm_qos_request *req, enum pm_qos_req_action action, s32 value) { struct dev_pm_qos *qos = req->dev->power.qos; int ret; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: if (WARN_ON(action != PM_QOS_REMOVE_REQ && value < 0)) value = 0; ret = pm_qos_update_target(&qos->resume_latency, &req->data.pnode, action, value); break; case DEV_PM_QOS_LATENCY_TOLERANCE: ret = pm_qos_update_target(&qos->latency_tolerance, &req->data.pnode, action, value); if (ret) { value = pm_qos_read_value(&qos->latency_tolerance); req->dev->power.set_latency_tolerance(req->dev, value); } break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_apply(&req->data.freq, action, value); break; case DEV_PM_QOS_FLAGS: ret = pm_qos_update_flags(&qos->flags, &req->data.flr, action, value); break; default: ret = -EINVAL; } return ret; } /* * dev_pm_qos_constraints_allocate * @dev: device to allocate data for * * Called at the first call to add_request, for constraint data allocation * Must be called with the dev_pm_qos_mtx mutex held */ static int dev_pm_qos_constraints_allocate(struct device *dev) { struct dev_pm_qos *qos; struct pm_qos_constraints *c; struct blocking_notifier_head *n; qos = kzalloc(sizeof(*qos), GFP_KERNEL); if (!qos) return -ENOMEM; n = kzalloc(3 * sizeof(*n), GFP_KERNEL); if (!n) { kfree(qos); return -ENOMEM; } c = &qos->resume_latency; plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; c->type = PM_QOS_MIN; c->notifiers = n; BLOCKING_INIT_NOTIFIER_HEAD(n); c = &qos->latency_tolerance; plist_head_init(&c->list); c->target_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->default_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; c->type = PM_QOS_MIN; freq_constraints_init(&qos->freq); INIT_LIST_HEAD(&qos->flags.list); spin_lock_irq(&dev->power.lock); dev->power.qos = qos; spin_unlock_irq(&dev->power.lock); return 0; } static void __dev_pm_qos_hide_latency_limit(struct device *dev); static void __dev_pm_qos_hide_flags(struct device *dev); /** * dev_pm_qos_constraints_destroy * @dev: target device * * Called from the device PM subsystem on device removal under device_pm_lock(). */ void dev_pm_qos_constraints_destroy(struct device *dev) { struct dev_pm_qos *qos; struct dev_pm_qos_request *req, *tmp; struct pm_qos_constraints *c; struct pm_qos_flags *f; mutex_lock(&dev_pm_qos_sysfs_mtx); /* * If the device's PM QoS resume latency limit or PM QoS flags have been * exposed to user space, they have to be hidden at this point. */ pm_qos_sysfs_remove_resume_latency(dev); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); __dev_pm_qos_hide_flags(dev); qos = dev->power.qos; if (!qos) goto out; /* Flush the constraints lists for the device. */ c = &qos->resume_latency; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { /* * Update constraints list and call the notification * callbacks if needed */ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->latency_tolerance; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.min_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.max_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } f = &qos->flags; list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } spin_lock_irq(&dev->power.lock); dev->power.qos = ERR_PTR(-ENODEV); spin_unlock_irq(&dev->power.lock); kfree(qos->resume_latency.notifiers); kfree(qos); out: mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } static bool dev_pm_qos_invalid_req_type(struct device *dev, enum dev_pm_qos_req_type type) { return type == DEV_PM_QOS_LATENCY_TOLERANCE && !dev->power.set_latency_tolerance; } static int __dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret = 0; if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type)) return -EINVAL; if (WARN(dev_pm_qos_request_active(req), "%s() called for already added request\n", __func__)) return -EINVAL; if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); trace_dev_pm_qos_add_request(dev_name(dev), type, value); if (ret) return ret; req->dev = dev; req->type = type; if (req->type == DEV_PM_QOS_MIN_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MIN, value); else if (req->type == DEV_PM_QOS_MAX_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MAX, value); else ret = apply_constraint(req, PM_QOS_ADD_REQ, value); return ret; } /** * dev_pm_qos_add_request - inserts new qos request into the list * @dev: target device for the constraint * @req: pointer to a preallocated handle * @type: type of the request * @value: defines the qos request * * This function inserts a new entry in the device constraints list of * requested qos performance characteristics. It recomputes the aggregate * QoS expectations of parameters and initializes the dev_pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENOMEM if there's not enough memory * to allocate for data structures, -ENODEV if the device has just been removed * from the system. * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_add_request(dev, req, type, value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_request); /** * __dev_pm_qos_update_request - Modify an existing device PM QoS request. * @req : PM QoS request to modify. * @new_value: New value to request. */ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { s32 curr_value; int ret = 0; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: case DEV_PM_QOS_LATENCY_TOLERANCE: curr_value = req->data.pnode.prio; break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: curr_value = req->data.freq.pnode.prio; break; case DEV_PM_QOS_FLAGS: curr_value = req->data.flr.flags; break; default: return -EINVAL; } trace_dev_pm_qos_update_request(dev_name(req->dev), req->type, new_value); if (curr_value != new_value) ret = apply_constraint(req, PM_QOS_UPDATE_REQ, new_value); return ret; } /** * dev_pm_qos_update_request - modifies an existing qos request * @req : handle to list element holding a dev_pm_qos request to use * @new_value: defines the qos request * * Updates an existing dev PM qos request along with updating the * target value. * * Attempts are made to make this code callable on hot code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_update_request(req, new_value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_request); static int __dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; trace_dev_pm_qos_remove_request(dev_name(req->dev), req->type, PM_QOS_DEFAULT_VALUE); ret = apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); return ret; } /** * dev_pm_qos_remove_request - modifies an existing qos request * @req: handle to request list element * * Will remove pm qos request from the list of constraints and * recompute the current target value. Call this on slow code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_remove_request(req); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request); /** * dev_pm_qos_add_notifier - sets notification entry for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block managed by caller. * @type: request type. * * Will register the notifier into a notification chain that gets called * upon changes to the target value for the device. * * If the device's constraints object doesn't exist when this routine is called, * it will be created (or error code will be returned if that fails). */ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); if (ret) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier); /** * dev_pm_qos_remove_notifier - deletes notification for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block to be removed. * @type: request type. * * Will remove the notifier from the notification chain that gets called * upon changes to the target value. */ int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); /* Silently return if the constraints object is not present. */ if (IS_ERR_OR_NULL(dev->power.qos)) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier); /** * dev_pm_qos_add_ancestor_request - Add PM QoS request for device's ancestor. * @dev: Device whose ancestor to add the request for. * @req: Pointer to the preallocated handle. * @type: Type of the request. * @value: Constraint latency value. */ int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { struct device *ancestor = dev->parent; int ret = -ENODEV; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: while (ancestor && !ancestor->power.ignore_children) ancestor = ancestor->parent; break; case DEV_PM_QOS_LATENCY_TOLERANCE: while (ancestor && !ancestor->power.set_latency_tolerance) ancestor = ancestor->parent; break; default: ancestor = NULL; } if (ancestor) ret = dev_pm_qos_add_request(ancestor, req, type, value); if (ret < 0) req->dev = NULL; return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_ancestor_request); static void __dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos_request *req = NULL; switch(type) { case DEV_PM_QOS_RESUME_LATENCY: req = dev->power.qos->resume_latency_req; dev->power.qos->resume_latency_req = NULL; break; case DEV_PM_QOS_LATENCY_TOLERANCE: req = dev->power.qos->latency_tolerance_req; dev->power.qos->latency_tolerance_req = NULL; break; case DEV_PM_QOS_FLAGS: req = dev->power.qos->flags_req; dev->power.qos->flags_req = NULL; break; default: WARN_ON(1); return; } __dev_pm_qos_remove_request(req); kfree(req); } static void dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_drop_user_request(dev, type); mutex_unlock(&dev_pm_qos_mtx); } /** * dev_pm_qos_expose_latency_limit - Expose PM QoS latency limit to user space. * @dev: Device whose PM QoS latency limit is to be exposed to user space. * @value: Initial value of the latency limit. */ int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev) || value < 0) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_RESUME_LATENCY, value); if (ret < 0) { kfree(req); return ret; } mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->resume_latency_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->resume_latency_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_resume_latency(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_limit); static void __dev_pm_qos_hide_latency_limit(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->resume_latency_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); } /** * dev_pm_qos_hide_latency_limit - Hide PM QoS latency limit from user space. * @dev: Device whose PM QoS latency limit is to be hidden from user space. */ void dev_pm_qos_hide_latency_limit(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_resume_latency(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_limit); /** * dev_pm_qos_expose_flags - Expose PM QoS flags of a device to user space. * @dev: Device whose PM QoS flags are to be exposed to user space. * @val: Initial values of the flags. */ int dev_pm_qos_expose_flags(struct device *dev, s32 val) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev)) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_FLAGS, val); if (ret < 0) { kfree(req); return ret; } pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->flags_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->flags_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_flags(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_flags); static void __dev_pm_qos_hide_flags(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->flags_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); } /** * dev_pm_qos_hide_flags - Hide PM QoS flags of a device from user space. * @dev: Device whose PM QoS flags are to be hidden from user space. */ void dev_pm_qos_hide_flags(struct device *dev) { pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_flags(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_flags); /** * dev_pm_qos_update_flags - Update PM QoS flags request owned by user space. * @dev: Device to update the PM QoS flags request for. * @mask: Flags to set/clear. * @set: Whether to set or clear the flags (true means set). */ int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set) { s32 value; int ret; pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->flags_req) { ret = -EINVAL; goto out; } value = dev_pm_qos_requested_flags(dev); if (set) value |= mask; else value &= ~mask; ret = __dev_pm_qos_update_request(dev->power.qos->flags_req, value); out: mutex_unlock(&dev_pm_qos_mtx); pm_runtime_put(dev); return ret; } /** * dev_pm_qos_get_user_latency_tolerance - Get user space latency tolerance. * @dev: Device to obtain the user space latency tolerance for. */ s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { s32 ret; mutex_lock(&dev_pm_qos_mtx); ret = IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req ? PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT : dev->power.qos->latency_tolerance_req->data.pnode.prio; mutex_unlock(&dev_pm_qos_mtx); return ret; } /** * dev_pm_qos_update_user_latency_tolerance - Update user space latency tolerance. * @dev: Device to update the user space latency tolerance for. * @val: New user space latency tolerance for @dev (negative values disable). */ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { int ret; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req) { struct dev_pm_qos_request *req; if (val < 0) { if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT) ret = 0; else ret = -EINVAL; goto out; } req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) { ret = -ENOMEM; goto out; } ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val); if (ret < 0) { kfree(req); goto out; } dev->power.qos->latency_tolerance_req = req; } else { if (val < 0) { __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE); ret = 0; } else { ret = __dev_pm_qos_update_request(dev->power.qos->latency_tolerance_req, val); } } out: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance); /** * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace * @dev: Device whose latency tolerance to expose */ int dev_pm_qos_expose_latency_tolerance(struct device *dev) { int ret; if (!dev->power.set_latency_tolerance) return -EINVAL; mutex_lock(&dev_pm_qos_sysfs_mtx); ret = pm_qos_sysfs_add_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_tolerance); /** * dev_pm_qos_hide_latency_tolerance - Hide latency tolerance from userspace * @dev: Device whose latency tolerance to hide */ void dev_pm_qos_hide_latency_tolerance(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); /* Remove the request from user space now */ pm_runtime_get_sync(dev); dev_pm_qos_update_user_latency_tolerance(dev, PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_tolerance);
852 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 // SPDX-License-Identifier: GPL-2.0 /* * Dynamic byte queue limits. See include/linux/dynamic_queue_limits.h * * Copyright (c) 2011, Tom Herbert <therbert@google.com> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/dynamic_queue_limits.h> #include <linux/compiler.h> #include <linux/export.h> #define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0) #define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0) /* Records completed count and recalculates the queue limit */ void dql_completed(struct dql *dql, unsigned int count) { unsigned int inprogress, prev_inprogress, limit; unsigned int ovlimit, completed, num_queued; bool all_prev_completed; num_queued = READ_ONCE(dql->num_queued); /* Can't complete more than what's in queue */ BUG_ON(count > num_queued - dql->num_completed); completed = dql->num_completed + count; limit = dql->limit; ovlimit = POSDIFF(num_queued - dql->num_completed, limit); inprogress = num_queued - completed; prev_inprogress = dql->prev_num_queued - dql->num_completed; all_prev_completed = AFTER_EQ(completed, dql->prev_num_queued); if ((ovlimit && !inprogress) || (dql->prev_ovlimit && all_prev_completed)) { /* * Queue considered starved if: * - The queue was over-limit in the last interval, * and there is no more data in the queue. * OR * - The queue was over-limit in the previous interval and * when enqueuing it was possible that all queued data * had been consumed. This covers the case when queue * may have becomes starved between completion processing * running and next time enqueue was scheduled. * * When queue is starved increase the limit by the amount * of bytes both sent and completed in the last interval, * plus any previous over-limit. */ limit += POSDIFF(completed, dql->prev_num_queued) + dql->prev_ovlimit; dql->slack_start_time = jiffies; dql->lowest_slack = UINT_MAX; } else if (inprogress && prev_inprogress && !all_prev_completed) { /* * Queue was not starved, check if the limit can be decreased. * A decrease is only considered if the queue has been busy in * the whole interval (the check above). * * If there is slack, the amount of excess data queued above * the amount needed to prevent starvation, the queue limit * can be decreased. To avoid hysteresis we consider the * minimum amount of slack found over several iterations of the * completion routine. */ unsigned int slack, slack_last_objs; /* * Slack is the maximum of * - The queue limit plus previous over-limit minus twice * the number of objects completed. Note that two times * number of completed bytes is a basis for an upper bound * of the limit. * - Portion of objects in the last queuing operation that * was not part of non-zero previous over-limit. That is * "round down" by non-overlimit portion of the last * queueing operation. */ slack = POSDIFF(limit + dql->prev_ovlimit, 2 * (completed - dql->num_completed)); slack_last_objs = dql->prev_ovlimit ? POSDIFF(dql->prev_last_obj_cnt, dql->prev_ovlimit) : 0; slack = max(slack, slack_last_objs); if (slack < dql->lowest_slack) dql->lowest_slack = slack; if (time_after(jiffies, dql->slack_start_time + dql->slack_hold_time)) { limit = POSDIFF(limit, dql->lowest_slack); dql->slack_start_time = jiffies; dql->lowest_slack = UINT_MAX; } } /* Enforce bounds on limit */ limit = clamp(limit, dql->min_limit, dql->max_limit); if (limit != dql->limit) { dql->limit = limit; ovlimit = 0; } dql->adj_limit = limit + completed; dql->prev_ovlimit = ovlimit; dql->prev_last_obj_cnt = dql->last_obj_cnt; dql->num_completed = completed; dql->prev_num_queued = num_queued; } EXPORT_SYMBOL(dql_completed); void dql_reset(struct dql *dql) { /* Reset all dynamic values */ dql->limit = 0; dql->num_queued = 0; dql->num_completed = 0; dql->last_obj_cnt = 0; dql->prev_num_queued = 0; dql->prev_last_obj_cnt = 0; dql->prev_ovlimit = 0; dql->lowest_slack = UINT_MAX; dql->slack_start_time = jiffies; } EXPORT_SYMBOL(dql_reset); void dql_init(struct dql *dql, unsigned int hold_time) { dql->max_limit = DQL_MAX_LIMIT; dql->min_limit = 0; dql->slack_hold_time = hold_time; dql_reset(dql); } EXPORT_SYMBOL(dql_init);
6 6 6 6 6 6 2 4 5 4 1 6 5 6 4 2 2 2 3 3 2 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * "Ping" sockets * * Based on ipv4/udp.c code. * * Authors: Vasiliy Kulikov / Openwall (for Linux 2.6), * Pavel Kankovsky (for Linux 2.4.32) * * Pavel gave all rights to bugs to Vasiliy, * none of the bugs are Pavel's now. */ #include <linux/uaccess.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/snmp.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/export.h> #include <linux/bpf-cgroup.h> #include <net/sock.h> #include <net/ping.h> #include <net/udp.h> #include <net/route.h> #include <net/inet_common.h> #include <net/checksum.h> #if IS_ENABLED(CONFIG_IPV6) #include <linux/in6.h> #include <linux/icmpv6.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <net/transp_v6.h> #endif struct ping_table { struct hlist_head hash[PING_HTABLE_SIZE]; spinlock_t lock; }; static struct ping_table ping_table; struct pingv6_ops pingv6_ops; EXPORT_SYMBOL_GPL(pingv6_ops); static u16 ping_port_rover; static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) { u32 res = (num + net_hash_mix(net)) & mask; pr_debug("hash(%u) = %u\n", num, res); return res; } EXPORT_SYMBOL_GPL(ping_hash); static inline struct hlist_head *ping_hashslot(struct ping_table *table, struct net *net, unsigned int num) { return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)]; } int ping_get_port(struct sock *sk, unsigned short ident) { struct inet_sock *isk, *isk2; struct hlist_head *hlist; struct sock *sk2 = NULL; isk = inet_sk(sk); spin_lock(&ping_table.lock); if (ident == 0) { u32 i; u16 result = ping_port_rover + 1; for (i = 0; i < (1L << 16); i++, result++) { if (!result) result++; /* avoid zero */ hlist = ping_hashslot(&ping_table, sock_net(sk), result); sk_for_each(sk2, hlist) { isk2 = inet_sk(sk2); if (isk2->inet_num == result) goto next_port; } /* found */ ping_port_rover = ident = result; break; next_port: ; } if (i >= (1L << 16)) goto fail; } else { hlist = ping_hashslot(&ping_table, sock_net(sk), ident); sk_for_each(sk2, hlist) { isk2 = inet_sk(sk2); /* BUG? Why is this reuse and not reuseaddr? ping.c * doesn't turn off SO_REUSEADDR, and it doesn't expect * that other ping processes can steal its packets. */ if ((isk2->inet_num == ident) && (sk2 != sk) && (!sk2->sk_reuse || !sk->sk_reuse)) goto fail; } } pr_debug("found port/ident = %d\n", ident); isk->inet_num = ident; if (sk_unhashed(sk)) { pr_debug("was not hashed\n"); sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } spin_unlock(&ping_table.lock); return 0; fail: spin_unlock(&ping_table.lock); return -EADDRINUSE; } EXPORT_SYMBOL_GPL(ping_get_port); int ping_hash(struct sock *sk) { pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); BUG(); /* "Please do not press this button again." */ return 0; } void ping_unhash(struct sock *sk) { struct inet_sock *isk = inet_sk(sk); pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num); spin_lock(&ping_table.lock); if (sk_del_node_init_rcu(sk)) { isk->inet_num = 0; isk->inet_sport = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } spin_unlock(&ping_table.lock); } EXPORT_SYMBOL_GPL(ping_unhash); /* Called under rcu_read_lock() */ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) { struct hlist_head *hslot = ping_hashslot(&ping_table, net, ident); struct sock *sk = NULL; struct inet_sock *isk; int dif, sdif; if (skb->protocol == htons(ETH_P_IP)) { dif = inet_iif(skb); sdif = inet_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", (int)ident, &ip_hdr(skb)->daddr, dif); #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { dif = inet6_iif(skb); sdif = inet6_sdif(skb); pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n", (int)ident, &ipv6_hdr(skb)->daddr, dif); #endif } else { return NULL; } sk_for_each_rcu(sk, hslot) { isk = inet_sk(sk); pr_debug("iterate\n"); if (isk->inet_num != ident) continue; if (skb->protocol == htons(ETH_P_IP) && sk->sk_family == AF_INET) { pr_debug("found: %p: num=%d, daddr=%pI4, dif=%d\n", sk, (int) isk->inet_num, &isk->inet_rcv_saddr, sk->sk_bound_dev_if); if (isk->inet_rcv_saddr && isk->inet_rcv_saddr != ip_hdr(skb)->daddr) continue; #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6) && sk->sk_family == AF_INET6) { pr_debug("found: %p: num=%d, daddr=%pI6c, dif=%d\n", sk, (int) isk->inet_num, &sk->sk_v6_rcv_saddr, sk->sk_bound_dev_if); if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, &ipv6_hdr(skb)->daddr)) continue; #endif } else { continue; } if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && sk->sk_bound_dev_if != sdif) continue; goto exit; } sk = NULL; exit: return sk; } static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, kgid_t *high) { kgid_t *data = net->ipv4.ping_group_range.range; unsigned int seq; do { seq = read_seqbegin(&net->ipv4.ping_group_range.lock); *low = data[0]; *high = data[1]; } while (read_seqretry(&net->ipv4.ping_group_range.lock, seq)); } int ping_init_sock(struct sock *sk) { struct net *net = sock_net(sk); kgid_t group = current_egid(); struct group_info *group_info; int i; kgid_t low, high; int ret = 0; if (sk->sk_family == AF_INET6) sk->sk_ipv6only = 1; inet_get_ping_group_range_net(net, &low, &high); if (gid_lte(low, group) && gid_lte(group, high)) return 0; group_info = get_current_groups(); for (i = 0; i < group_info->ngroups; i++) { kgid_t gid = group_info->gid[i]; if (gid_lte(low, gid) && gid_lte(gid, high)) goto out_release_group; } ret = -EACCES; out_release_group: put_group_info(group_info); return ret; } EXPORT_SYMBOL_GPL(ping_init_sock); void ping_close(struct sock *sk, long timeout) { pr_debug("ping_close(sk=%p,sk->num=%u)\n", inet_sk(sk), inet_sk(sk)->inet_num); pr_debug("isk->refcnt = %d\n", refcount_read(&sk->sk_refcnt)); sk_common_release(sk); } EXPORT_SYMBOL_GPL(ping_close); static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { /* This check is replicated from __ip4_datagram_connect() and * intended to prevent BPF program called below from accessing bytes * that are out of the bound specified by user in addr_len. */ if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len); } /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, struct sockaddr *uaddr, int addr_len) { struct net *net = sock_net(sk); if (sk->sk_family == AF_INET) { struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; u32 tb_id = RT_TABLE_LOCAL; int chk_addr_ret; if (addr_len < sizeof(*addr)) return -EINVAL; if (addr->sin_family != AF_INET && !(addr->sin_family == AF_UNSPEC && addr->sin_addr.s_addr == htonl(INADDR_ANY))) return -EAFNOSUPPORT; pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) return 0; tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST || (chk_addr_ret != RTN_LOCAL && !inet_can_nonlocal_bind(net, isk))) return -EADDRNOTAVAIL; #if IS_ENABLED(CONFIG_IPV6) } else if (sk->sk_family == AF_INET6) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; int addr_type, scoped, has_addr; struct net_device *dev = NULL; if (addr_len < sizeof(*addr)) return -EINVAL; if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); addr_type = ipv6_addr_type(&addr->sin6_addr); scoped = __ipv6_addr_needs_scope_id(addr_type); if ((addr_type != IPV6_ADDR_ANY && !(addr_type & IPV6_ADDR_UNICAST)) || (scoped && !addr->sin6_scope_id)) return -EINVAL; rcu_read_lock(); if (addr->sin6_scope_id) { dev = dev_get_by_index_rcu(net, addr->sin6_scope_id); if (!dev) { rcu_read_unlock(); return -ENODEV; } } if (!dev && sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); if (!dev) { rcu_read_unlock(); return -ENODEV; } } has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev, scoped); rcu_read_unlock(); if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; if (scoped) sk->sk_bound_dev_if = addr->sin6_scope_id; #endif } else { return -EAFNOSUPPORT; } return 0; } static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr) { if (saddr->sa_family == AF_INET) { struct inet_sock *isk = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) saddr; isk->inet_rcv_saddr = isk->inet_saddr = addr->sin_addr.s_addr; #if IS_ENABLED(CONFIG_IPV6) } else if (saddr->sa_family == AF_INET6) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *) saddr; struct ipv6_pinfo *np = inet6_sk(sk); sk->sk_v6_rcv_saddr = np->saddr = addr->sin6_addr; #endif } } /* * We need our own bind because there are no privileged id's == local ports. * Moreover, we don't allow binding to multi- and broadcast addresses. */ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *isk = inet_sk(sk); unsigned short snum; int err; int dif = sk->sk_bound_dev_if; err = ping_check_bind_addr(sk, isk, uaddr, addr_len); if (err) return err; lock_sock(sk); err = -EINVAL; if (isk->inet_num != 0) goto out; err = -EADDRINUSE; snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port); if (ping_get_port(sk, snum) != 0) { /* Restore possibly modified sk->sk_bound_dev_if by ping_check_bind_addr(). */ sk->sk_bound_dev_if = dif; goto out; } ping_set_saddr(sk, uaddr); pr_debug("after bind(): num = %hu, dif = %d\n", isk->inet_num, sk->sk_bound_dev_if); err = 0; if (sk->sk_family == AF_INET && isk->inet_rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6 && !ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; #endif if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; isk->inet_sport = htons(isk->inet_num); isk->inet_daddr = 0; isk->inet_dport = 0; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr)); #endif sk_dst_reset(sk); out: release_sock(sk); pr_debug("ping_v4_bind -> %d\n", err); return err; } EXPORT_SYMBOL_GPL(ping_bind); /* * Is this a supported type of ICMP message? */ static inline int ping_supported(int family, int type, int code) { return (family == AF_INET && type == ICMP_ECHO && code == 0) || (family == AF_INET && type == ICMP_EXT_ECHO && code == 0) || (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0) || (family == AF_INET6 && type == ICMPV6_EXT_ECHO_REQUEST && code == 0); } /* * This routine is called by the ICMP module when it gets some * sort of error condition. */ void ping_err(struct sk_buff *skb, int offset, u32 info) { int family; struct icmphdr *icmph; struct inet_sock *inet_sock; int type; int code; struct net *net = dev_net(skb->dev); struct sock *sk; int harderr; int err; if (skb->protocol == htons(ETH_P_IP)) { family = AF_INET; type = icmp_hdr(skb)->type; code = icmp_hdr(skb)->code; icmph = (struct icmphdr *)(skb->data + offset); } else if (skb->protocol == htons(ETH_P_IPV6)) { family = AF_INET6; type = icmp6_hdr(skb)->icmp6_type; code = icmp6_hdr(skb)->icmp6_code; icmph = (struct icmphdr *) (skb->data + offset); } else { BUG(); } /* We assume the packet has already been checked by icmp_unreach */ if (!ping_supported(family, icmph->type, icmph->code)) return; pr_debug("ping_err(proto=0x%x,type=%d,code=%d,id=%04x,seq=%04x)\n", skb->protocol, type, code, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (!sk) { pr_debug("no socket, dropping\n"); return; /* No socket for error */ } pr_debug("err on socket %p\n", sk); err = 0; harderr = 0; inet_sock = inet_sk(sk); if (skb->protocol == htons(ETH_P_IP)) { switch (type) { default: case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: /* This is not a real error but ping wants to see it. * Report it with some fake errno. */ err = EREMOTEIO; break; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ ipv4_sk_update_pmtu(skb, sk, info); if (READ_ONCE(inet_sock->pmtudisc) != IP_PMTUDISC_DONT) { err = EMSGSIZE; harderr = 1; break; } goto out; } err = EHOSTUNREACH; if (code <= NR_ICMP_UNREACH) { harderr = icmp_err_convert[code].fatal; err = icmp_err_convert[code].errno; } break; case ICMP_REDIRECT: /* See ICMP_SOURCE_QUENCH */ ipv4_sk_redirect(skb, sk); err = EREMOTEIO; break; } #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { harderr = pingv6_ops.icmpv6_err_convert(type, code, &err); #endif } /* * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. */ if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) || (family == AF_INET6 && !inet6_test_bit(RECVERR6, sk))) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else { if (family == AF_INET) { ip_icmp_error(sk, skb, err, 0 /* no remote port */, info, (u8 *)icmph); #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { pingv6_ops.ipv6_icmp_error(sk, skb, err, 0, info, (u8 *)icmph); #endif } } sk->sk_err = err; sk_error_report(sk); out: return; } EXPORT_SYMBOL_GPL(ping_err); /* * Copy and checksum an ICMP Echo packet from user space into a buffer * starting from the payload. */ int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd, struct sk_buff *skb) { struct pingfakehdr *pfh = from; if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck, &pfh->msg->msg_iter)) return -EFAULT; #if IS_ENABLED(CONFIG_IPV6) /* For IPv6, checksum each skb as we go along, as expected by * icmpv6_push_pending_frames. For IPv4, accumulate the checksum in * wcheck, it will be finalized in ping_v4_push_pending_frames. */ if (pfh->family == AF_INET6) { skb->csum = csum_block_add(skb->csum, pfh->wcheck, odd); skb->ip_summed = CHECKSUM_NONE; pfh->wcheck = 0; } #endif return 0; } EXPORT_SYMBOL_GPL(ping_getfrag); static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, struct flowi4 *fl4) { struct sk_buff *skb = skb_peek(&sk->sk_write_queue); if (!skb) return 0; pfh->wcheck = csum_partial((char *)&pfh->icmph, sizeof(struct icmphdr), pfh->wcheck); pfh->icmph.checksum = csum_fold(pfh->wcheck); memcpy(icmp_hdr(skb), &pfh->icmph, sizeof(struct icmphdr)); skb->ip_summed = CHECKSUM_NONE; return ip_push_pending_frames(sk, fl4); } int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, void *user_icmph, size_t icmph_len) { u8 type, code; if (len > 0xFFFF) return -EMSGSIZE; /* Must have at least a full ICMP header. */ if (len < icmph_len) return -EINVAL; /* * Check the flags. */ /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; /* * Fetch the ICMP header provided by the userland. * iovec is modified! The ICMP header is consumed. */ if (memcpy_from_msg(user_icmph, msg, icmph_len)) return -EFAULT; if (family == AF_INET) { type = ((struct icmphdr *) user_icmph)->type; code = ((struct icmphdr *) user_icmph)->code; #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { type = ((struct icmp6hdr *) user_icmph)->icmp6_type; code = ((struct icmp6hdr *) user_icmph)->icmp6_code; #endif } else { BUG(); } if (!ping_supported(family, type, code)) return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(ping_common_sendmsg); static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct net *net = sock_net(sk); struct flowi4 fl4; struct inet_sock *inet = inet_sk(sk); struct ipcm_cookie ipc; struct icmphdr user_icmph; struct pingfakehdr pfh; struct rtable *rt = NULL; struct ip_options_data opt_copy; int free = 0; __be32 saddr, daddr, faddr; u8 tos, scope; int err; pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); err = ping_common_sendmsg(AF_INET, msg, len, &user_icmph, sizeof(user_icmph)); if (err) return err; /* * Get and verify the address. */ if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; daddr = usin->sin_addr.s_addr; /* no remote port */ } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = inet->inet_daddr; /* no remote port */ } ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); return err; } if (ipc.opt) free = 1; } if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(&opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = &opt_copy.opt; } rcu_read_unlock(); } saddr = ipc.addr; ipc.addr = faddr = daddr; if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; goto out_free; } faddr = ipc.opt->opt.faddr; } tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); } else if (!ipc.oif) ipc.oif = READ_ONCE(inet->uc_index); flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); fl4.fl4_icmp_type = user_icmph.type; fl4.fl4_icmp_code = user_icmph.code; security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); goto out; } err = -EACCES; if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) goto out; if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; back_from_confirm: if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); pfh.icmph.type = user_icmph.type; /* already checked */ pfh.icmph.code = user_icmph.code; /* ditto */ pfh.icmph.checksum = 0; pfh.icmph.un.echo.id = inet->inet_sport; pfh.icmph.un.echo.sequence = user_icmph.un.echo.sequence; pfh.msg = msg; pfh.wcheck = 0; pfh.family = AF_INET; err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, sizeof(struct icmphdr), &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else err = ping_v4_push_pending_frames(sk, &pfh, &fl4); release_sock(sk); out: ip_rt_put(rt); out_free: if (free) kfree(ipc.opt); if (!err) { icmp_out_count(sock_net(sk), user_icmph.type); return len; } return err; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(&rt->dst, &fl4.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto out; } int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *isk = inet_sk(sk); int family = sk->sk_family; struct sk_buff *skb; int copied, err; pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); err = -EOPNOTSUPP; if (flags & MSG_OOB) goto out; if (flags & MSG_ERRQUEUE) return inet_recv_error(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } /* Don't bother checking the checksum */ err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_timestamp(msg, sk, skb); /* Copy the address and add cmsg data. */ if (family == AF_INET) { DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); if (sin) { sin->sin_family = AF_INET; sin->sin_port = 0 /* skb->h.uh->source */; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } if (inet_cmsg_flags(isk)) ip_cmsg_recv(msg, skb); #if IS_ENABLED(CONFIG_IPV6) } else if (family == AF_INET6) { struct ipv6hdr *ip6 = ipv6_hdr(skb); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); if (sin6) { sin6->sin6_family = AF_INET6; sin6->sin6_port = 0; sin6->sin6_addr = ip6->saddr; sin6->sin6_flowinfo = 0; if (inet6_test_bit(SNDFLOW, sk)) sin6->sin6_flowinfo = ip6_flowinfo(ip6); sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, inet6_iif(skb)); *addr_len = sizeof(*sin6); } if (inet6_sk(sk)->rxopt.all) pingv6_ops.ip6_datagram_recv_common_ctl(sk, msg, skb); if (skb->protocol == htons(ETH_P_IPV6) && inet6_sk(sk)->rxopt.all) pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb); else if (skb->protocol == htons(ETH_P_IP) && inet_cmsg_flags(isk)) ip_cmsg_recv(msg, skb); #endif } else { BUG(); } err = copied; done: skb_free_datagram(sk, skb); out: pr_debug("ping_recvmsg -> %d\n", err); return err; } EXPORT_SYMBOL_GPL(ping_recvmsg); static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", inet_sk(sk), inet_sk(sk)->inet_num, skb); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { kfree_skb_reason(skb, reason); pr_debug("ping_queue_rcv_skb -> failed\n"); return reason; } return SKB_NOT_DROPPED_YET; } int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { return __ping_queue_rcv_skb(sk, skb) ? -1 : 0; } EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); /* * All we need to do is get the socket. */ enum skb_drop_reason ping_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NO_SOCKET; struct sock *sk; struct net *net = dev_net(skb->dev); struct icmphdr *icmph = icmp_hdr(skb); /* We assume the packet has already been checked by icmp_rcv */ pr_debug("ping_rcv(skb=%p,id=%04x,seq=%04x)\n", skb, ntohs(icmph->un.echo.id), ntohs(icmph->un.echo.sequence)); /* Push ICMP header back */ skb_push(skb, skb->data - (u8 *)icmph); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (sk) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); pr_debug("rcv on socket %p\n", sk); if (skb2) reason = __ping_queue_rcv_skb(sk, skb2); else reason = SKB_DROP_REASON_NOMEM; } if (reason) pr_debug("no socket, dropping\n"); return reason; } EXPORT_SYMBOL_GPL(ping_rcv); struct proto ping_prot = { .name = "PING", .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, .pre_connect = ping_pre_connect, .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .sendmsg = ping_v4_sendmsg, .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .release_cb = ip4_datagram_release_cb, .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct inet_sock), }; EXPORT_SYMBOL(ping_prot); #ifdef CONFIG_PROC_FS static struct sock *ping_get_first(struct seq_file *seq, int start) { struct sock *sk; struct ping_iter_state *state = seq->private; struct net *net = seq_file_net(seq); for (state->bucket = start; state->bucket < PING_HTABLE_SIZE; ++state->bucket) { struct hlist_head *hslot; hslot = &ping_table.hash[state->bucket]; if (hlist_empty(hslot)) continue; sk_for_each(sk, hslot) { if (net_eq(sock_net(sk), net) && sk->sk_family == state->family) goto found; } } sk = NULL; found: return sk; } static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk) { struct ping_iter_state *state = seq->private; struct net *net = seq_file_net(seq); do { sk = sk_next(sk); } while (sk && (!net_eq(sock_net(sk), net))); if (!sk) return ping_get_first(seq, state->bucket + 1); return sk; } static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = ping_get_first(seq, 0); if (sk) while (pos && (sk = ping_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) __acquires(ping_table.lock) { struct ping_iter_state *state = seq->private; state->bucket = 0; state->family = family; spin_lock(&ping_table.lock); return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } EXPORT_SYMBOL_GPL(ping_seq_start); static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos) { return ping_seq_start(seq, pos, AF_INET); } void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = ping_get_idx(seq, 0); else sk = ping_get_next(seq, v); ++*pos; return sk; } EXPORT_SYMBOL_GPL(ping_seq_next); void ping_seq_stop(struct seq_file *seq, void *v) __releases(ping_table.lock) { spin_unlock(&ping_table.lock); } EXPORT_SYMBOL_GPL(ping_seq_stop); static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } static int ping_v4_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct ping_iter_state *state = seq->private; ping_v4_format_sock(v, seq, state->bucket); } seq_pad(seq, '\n'); return 0; } static const struct seq_operations ping_v4_seq_ops = { .start = ping_v4_seq_start, .show = ping_v4_seq_show, .next = ping_seq_next, .stop = ping_seq_stop, }; static int __net_init ping_v4_proc_init_net(struct net *net) { if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops, sizeof(struct ping_iter_state))) return -ENOMEM; return 0; } static void __net_exit ping_v4_proc_exit_net(struct net *net) { remove_proc_entry("icmp", net->proc_net); } static struct pernet_operations ping_v4_net_ops = { .init = ping_v4_proc_init_net, .exit = ping_v4_proc_exit_net, }; int __init ping_proc_init(void) { return register_pernet_subsys(&ping_v4_net_ops); } void ping_proc_exit(void) { unregister_pernet_subsys(&ping_v4_net_ops); } #endif void __init ping_init(void) { int i; for (i = 0; i < PING_HTABLE_SIZE; i++) INIT_HLIST_HEAD(&ping_table.hash[i]); spin_lock_init(&ping_table.lock); }
5312 6 1288 1291 2340 3691 1153 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VMSTAT_H #define _LINUX_VMSTAT_H #include <linux/types.h> #include <linux/percpu.h> #include <linux/mmzone.h> #include <linux/vm_event_item.h> #include <linux/atomic.h> #include <linux/static_key.h> #include <linux/mmdebug.h> extern int sysctl_stat_interval; #ifdef CONFIG_NUMA #define ENABLE_NUMA_STAT 1 #define DISABLE_NUMA_STAT 0 extern int sysctl_vm_numa_stat; DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); #endif struct reclaim_stat { unsigned nr_dirty; unsigned nr_unqueued_dirty; unsigned nr_congested; unsigned nr_writeback; unsigned nr_immediate; unsigned nr_pageout; unsigned nr_activate[ANON_AND_FILE]; unsigned nr_ref_keep; unsigned nr_unmap_fail; unsigned nr_lazyfree_fail; }; enum writeback_stat_item { NR_DIRTY_THRESHOLD, NR_DIRTY_BG_THRESHOLD, NR_VM_WRITEBACK_STAT_ITEMS, }; #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. * * Counters should only be incremented and no critical kernel component * should rely on the counter values. * * Counters are handled completely inline. On many platforms the code * generated will simply be the increment of a global address. */ struct vm_event_state { unsigned long event[NR_VM_EVENT_ITEMS]; }; DECLARE_PER_CPU(struct vm_event_state, vm_event_states); /* * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the * local_irq_disable overhead. */ static inline void __count_vm_event(enum vm_event_item item) { raw_cpu_inc(vm_event_states.event[item]); } static inline void count_vm_event(enum vm_event_item item) { this_cpu_inc(vm_event_states.event[item]); } static inline void __count_vm_events(enum vm_event_item item, long delta) { raw_cpu_add(vm_event_states.event[item], delta); } static inline void count_vm_events(enum vm_event_item item, long delta) { this_cpu_add(vm_event_states.event[item], delta); } extern void all_vm_events(unsigned long *); extern void vm_events_fold_cpu(int cpu); #else /* Disable counters */ static inline void count_vm_event(enum vm_event_item item) { } static inline void count_vm_events(enum vm_event_item item, long delta) { } static inline void __count_vm_event(enum vm_event_item item) { } static inline void __count_vm_events(enum vm_event_item item, long delta) { } static inline void all_vm_events(unsigned long *ret) { } static inline void vm_events_fold_cpu(int cpu) { } #endif /* CONFIG_VM_EVENT_COUNTERS */ #ifdef CONFIG_NUMA_BALANCING #define count_vm_numa_event(x) count_vm_event(x) #define count_vm_numa_events(x, y) count_vm_events(x, y) #else #define count_vm_numa_event(x) do {} while (0) #define count_vm_numa_events(x, y) do { (void)(y); } while (0) #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_DEBUG_TLBFLUSH #define count_vm_tlb_event(x) count_vm_event(x) #define count_vm_tlb_events(x, y) count_vm_events(x, y) #else #define count_vm_tlb_event(x) do {} while (0) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif #ifdef CONFIG_PER_VMA_LOCK_STATS #define count_vm_vma_lock_event(x) count_vm_event(x) #else #define count_vm_vma_lock_event(x) do {} while (0) #endif #define __count_zid_vm_events(item, zid, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) /* * Zone and node-based page accounting with per cpu differentials. */ extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; extern atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #ifdef CONFIG_NUMA static inline void zone_numa_event_add(long x, struct zone *zone, enum numa_stat_item item) { atomic_long_add(x, &zone->vm_numa_event[item]); atomic_long_add(x, &vm_numa_event[item]); } static inline unsigned long zone_numa_event_state(struct zone *zone, enum numa_stat_item item) { return atomic_long_read(&zone->vm_numa_event[item]); } static inline unsigned long global_numa_event_state(enum numa_stat_item item) { return atomic_long_read(&vm_numa_event[item]); } #endif /* CONFIG_NUMA */ static inline void zone_page_state_add(long x, struct zone *zone, enum zone_stat_item item) { atomic_long_add(x, &zone->vm_stat[item]); atomic_long_add(x, &vm_zone_stat[item]); } static inline void node_page_state_add(long x, struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_add(x, &pgdat->vm_stat[item]); atomic_long_add(x, &vm_node_stat[item]); } static inline unsigned long global_zone_page_state(enum zone_stat_item item) { long x = atomic_long_read(&vm_zone_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long global_node_page_state_pages(enum node_stat_item item) { long x = atomic_long_read(&vm_node_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long global_node_page_state(enum node_stat_item item) { VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); return global_node_page_state_pages(item); } static inline unsigned long zone_page_state(struct zone *zone, enum zone_stat_item item) { long x = atomic_long_read(&zone->vm_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } /* * More accurate version that also considers the currently pending * deltas. For that we need to loop over all cpus to find the current * deltas. There is no synchronization so the result cannot be * exactly accurate either. */ static inline unsigned long zone_page_state_snapshot(struct zone *zone, enum zone_stat_item item) { long x = atomic_long_read(&zone->vm_stat[item]); #ifdef CONFIG_SMP int cpu; for_each_online_cpu(cpu) x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item]; if (x < 0) x = 0; #endif return x; } #ifdef CONFIG_NUMA /* See __count_vm_event comment on why raw_cpu_inc is used. */ static inline void __count_numa_event(struct zone *zone, enum numa_stat_item item) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; raw_cpu_inc(pzstats->vm_numa_event[item]); } static inline void __count_numa_events(struct zone *zone, enum numa_stat_item item, long delta) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; raw_cpu_add(pzstats->vm_numa_event[item], delta); } extern unsigned long sum_zone_node_page_state(int node, enum zone_stat_item item); extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item); extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); extern unsigned long node_page_state_pages(struct pglist_data *pgdat, enum node_stat_item item); extern void fold_vm_numa_events(void); #else #define sum_zone_node_page_state(node, item) global_zone_page_state(item) #define node_page_state(node, item) global_node_page_state(item) #define node_page_state_pages(node, item) global_node_page_state_pages(item) static inline void fold_vm_numa_events(void) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_SMP void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long); void __inc_zone_page_state(struct page *, enum zone_stat_item); void __dec_zone_page_state(struct page *, enum zone_stat_item); void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long); void __inc_node_page_state(struct page *, enum node_stat_item); void __dec_node_page_state(struct page *, enum node_stat_item); void mod_zone_page_state(struct zone *, enum zone_stat_item, long); void inc_zone_page_state(struct page *, enum zone_stat_item); void dec_zone_page_state(struct page *, enum zone_stat_item); void mod_node_page_state(struct pglist_data *, enum node_stat_item, long); void inc_node_page_state(struct page *, enum node_stat_item); void dec_node_page_state(struct page *, enum node_stat_item); extern void inc_node_state(struct pglist_data *, enum node_stat_item); extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void __inc_node_state(struct pglist_data *, enum node_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_node_state(struct pglist_data *, enum node_stat_item); void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); struct ctl_table; int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, loff_t *ppos); void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *); int calculate_pressure_threshold(struct zone *zone); int calculate_normal_threshold(struct zone *zone); void set_pgdat_percpu_threshold(pg_data_t *pgdat, int (*calculate_pressure)(struct zone *)); #else /* CONFIG_SMP */ /* * We do not maintain differentials in a single processor configuration. * The functions directly modify the zone and global counters. */ static inline void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long delta) { zone_page_state_add(delta, zone, item); } static inline void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, int delta) { if (vmstat_item_in_bytes(item)) { /* * Only cgroups use subpage accounting right now; at * the global level, these items still change in * multiples of whole pages. Store them as pages * internally to keep the per-cpu counters compact. */ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); delta >>= PAGE_SHIFT; } node_page_state_add(delta, pgdat, item); } static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_inc(&zone->vm_stat[item]); atomic_long_inc(&vm_zone_stat[item]); } static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_inc(&pgdat->vm_stat[item]); atomic_long_inc(&vm_node_stat[item]); } static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_dec(&zone->vm_stat[item]); atomic_long_dec(&vm_zone_stat[item]); } static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_dec(&pgdat->vm_stat[item]); atomic_long_dec(&vm_node_stat[item]); } static inline void __inc_zone_page_state(struct page *page, enum zone_stat_item item) { __inc_zone_state(page_zone(page), item); } static inline void __inc_node_page_state(struct page *page, enum node_stat_item item) { __inc_node_state(page_pgdat(page), item); } static inline void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { __dec_zone_state(page_zone(page), item); } static inline void __dec_node_page_state(struct page *page, enum node_stat_item item) { __dec_node_state(page_pgdat(page), item); } /* * We only use atomic operations to update counters. So there is no need to * disable interrupts. */ #define inc_zone_page_state __inc_zone_page_state #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state #define inc_node_page_state __inc_node_page_state #define dec_node_page_state __dec_node_page_state #define mod_node_page_state __mod_node_page_state #define inc_zone_state __inc_zone_state #define inc_node_state __inc_node_state #define dec_zone_state __dec_zone_state #define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_zone_stat_thresholds(void) { } static inline void cpu_vm_stats_fold(int cpu) { } static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) { } #endif /* CONFIG_SMP */ static inline void __zone_stat_mod_folio(struct folio *folio, enum zone_stat_item item, long nr) { __mod_zone_page_state(folio_zone(folio), item, nr); } static inline void __zone_stat_add_folio(struct folio *folio, enum zone_stat_item item) { __mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); } static inline void __zone_stat_sub_folio(struct folio *folio, enum zone_stat_item item) { __mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); } static inline void zone_stat_mod_folio(struct folio *folio, enum zone_stat_item item, long nr) { mod_zone_page_state(folio_zone(folio), item, nr); } static inline void zone_stat_add_folio(struct folio *folio, enum zone_stat_item item) { mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); } static inline void zone_stat_sub_folio(struct folio *folio, enum zone_stat_item item) { mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); } static inline void __node_stat_mod_folio(struct folio *folio, enum node_stat_item item, long nr) { __mod_node_page_state(folio_pgdat(folio), item, nr); } static inline void __node_stat_add_folio(struct folio *folio, enum node_stat_item item) { __mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); } static inline void __node_stat_sub_folio(struct folio *folio, enum node_stat_item item) { __mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); } static inline void node_stat_mod_folio(struct folio *folio, enum node_stat_item item, long nr) { mod_node_page_state(folio_pgdat(folio), item, nr); } static inline void node_stat_add_folio(struct folio *folio, enum node_stat_item item) { mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); } static inline void node_stat_sub_folio(struct folio *folio, enum node_stat_item item) { mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); } static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, int migratetype) { __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); if (is_migrate_cma(migratetype)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); } extern const char * const vmstat_text[]; static inline const char *zone_stat_name(enum zone_stat_item item) { return vmstat_text[item]; } #ifdef CONFIG_NUMA static inline const char *numa_stat_name(enum numa_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + item]; } #endif /* CONFIG_NUMA */ static inline const char *node_stat_name(enum node_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + item]; } static inline const char *lru_list_name(enum lru_list lru) { return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" } static inline const char *writeback_stat_name(enum writeback_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + NR_VM_NODE_STAT_ITEMS + item]; } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) static inline const char *vm_event_name(enum vm_event_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + NR_VM_NODE_STAT_ITEMS + NR_VM_WRITEBACK_STAT_ITEMS + item]; } #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ #ifdef CONFIG_MEMCG void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_lruvec_state(lruvec, idx, val); local_irq_restore(flags); } void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val); static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_lruvec_page_state(page, idx, val); local_irq_restore(flags); } #else static inline void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { __mod_node_page_state(page_pgdat(page), idx, val); } static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { mod_node_page_state(page_pgdat(page), idx, val); } #endif /* CONFIG_MEMCG */ static inline void __inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { __mod_lruvec_page_state(page, idx, 1); } static inline void __dec_lruvec_page_state(struct page *page, enum node_stat_item idx) { __mod_lruvec_page_state(page, idx, -1); } static inline void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { __mod_lruvec_page_state(&folio->page, idx, val); } static inline void __lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); } static inline void __lruvec_stat_sub_folio(struct folio *folio, enum node_stat_item idx) { __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } static inline void inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { mod_lruvec_page_state(page, idx, 1); } static inline void dec_lruvec_page_state(struct page *page, enum node_stat_item idx) { mod_lruvec_page_state(page, idx, -1); } static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { mod_lruvec_page_state(&folio->page, idx, val); } static inline void lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); } static inline void lruvec_stat_sub_folio(struct folio *folio, enum node_stat_item idx) { lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } #endif /* _LINUX_VMSTAT_H */
107 1261 2038 2206 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 /* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt.h: declarations for per-file encryption * * Filesystems that implement per-file encryption must include this header * file. * * Copyright (C) 2015, Google, Inc. * * Written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. */ #ifndef _LINUX_FSCRYPT_H #define _LINUX_FSCRYPT_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/slab.h> #include <uapi/linux/fscrypt.h> /* * The lengths of all file contents blocks must be divisible by this value. * This is needed to ensure that all contents encryption modes will work, as * some of the supported modes don't support arbitrarily byte-aligned messages. * * Since the needed alignment is 16 bytes, most filesystems will meet this * requirement naturally, as typical block sizes are powers of 2. However, if a * filesystem can generate arbitrarily byte-aligned block lengths (e.g., via * compression), then it will need to pad to this alignment before encryption. */ #define FSCRYPT_CONTENTS_ALIGNMENT 16 union fscrypt_policy; struct fscrypt_inode_info; struct fs_parameter; struct seq_file; struct fscrypt_str { unsigned char *name; u32 len; }; struct fscrypt_name { const struct qstr *usr_fname; struct fscrypt_str disk_name; u32 hash; u32 minor_hash; struct fscrypt_str crypto_buf; bool is_nokey_name; }; #define FSTR_INIT(n, l) { .name = n, .len = l } #define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) #define fname_name(p) ((p)->disk_name.name) #define fname_len(p) ((p)->disk_name.len) /* Maximum value for the third parameter of fscrypt_operations.set_context(). */ #define FSCRYPT_SET_CONTEXT_MAX_SIZE 40 #ifdef CONFIG_FS_ENCRYPTION /* Crypto operations for filesystems */ struct fscrypt_operations { /* * If set, then fs/crypto/ will allocate a global bounce page pool the * first time an encryption key is set up for a file. The bounce page * pool is required by the following functions: * * - fscrypt_encrypt_pagecache_blocks() * - fscrypt_zeroout_range() for files not using inline crypto * * If the filesystem doesn't use those, it doesn't need to set this. */ unsigned int needs_bounce_pages : 1; /* * If set, then fs/crypto/ will allow the use of encryption settings * that assume inode numbers fit in 32 bits (i.e. * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64}), provided that the other * prerequisites for these settings are also met. This is only useful * if the filesystem wants to support inline encryption hardware that is * limited to 32-bit or 64-bit data unit numbers and where programming * keyslots is very slow. */ unsigned int has_32bit_inodes : 1; /* * If set, then fs/crypto/ will allow users to select a crypto data unit * size that is less than the filesystem block size. This is done via * the log2_data_unit_size field of the fscrypt policy. This flag is * not compatible with filesystems that encrypt variable-length blocks * (i.e. blocks that aren't all equal to filesystem's block size), for * example as a result of compression. It's also not compatible with * the fscrypt_encrypt_block_inplace() and * fscrypt_decrypt_block_inplace() functions. */ unsigned int supports_subblock_data_units : 1; /* * This field exists only for backwards compatibility reasons and should * only be set by the filesystems that are setting it already. It * contains the filesystem-specific key description prefix that is * accepted for "logon" keys for v1 fscrypt policies. This * functionality is deprecated in favor of the generic prefix * "fscrypt:", which itself is deprecated in favor of the filesystem * keyring ioctls such as FS_IOC_ADD_ENCRYPTION_KEY. Filesystems that * are newly adding fscrypt support should not set this field. */ const char *legacy_key_prefix; /* * Get the fscrypt context of the given inode. * * @inode: the inode whose context to get * @ctx: the buffer into which to get the context * @len: length of the @ctx buffer in bytes * * Return: On success, returns the length of the context in bytes; this * may be less than @len. On failure, returns -ENODATA if the * inode doesn't have a context, -ERANGE if the context is * longer than @len, or another -errno code. */ int (*get_context)(struct inode *inode, void *ctx, size_t len); /* * Set an fscrypt context on the given inode. * * @inode: the inode whose context to set. The inode won't already have * an fscrypt context. * @ctx: the context to set * @len: length of @ctx in bytes (at most FSCRYPT_SET_CONTEXT_MAX_SIZE) * @fs_data: If called from fscrypt_set_context(), this will be the * value the filesystem passed to fscrypt_set_context(). * Otherwise (i.e. when called from * FS_IOC_SET_ENCRYPTION_POLICY) this will be NULL. * * i_rwsem will be held for write. * * Return: 0 on success, -errno on failure. */ int (*set_context)(struct inode *inode, const void *ctx, size_t len, void *fs_data); /* * Get the dummy fscrypt policy in use on the filesystem (if any). * * Filesystems only need to implement this function if they support the * test_dummy_encryption mount option. * * Return: A pointer to the dummy fscrypt policy, if the filesystem is * mounted with test_dummy_encryption; otherwise NULL. */ const union fscrypt_policy *(*get_dummy_policy)(struct super_block *sb); /* * Check whether a directory is empty. i_rwsem will be held for write. */ bool (*empty_dir)(struct inode *inode); /* * Check whether the filesystem's inode numbers and UUID are stable, * meaning that they will never be changed even by offline operations * such as filesystem shrinking and therefore can be used in the * encryption without the possibility of files becoming unreadable. * * Filesystems only need to implement this function if they want to * support the FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags. These * flags are designed to work around the limitations of UFS and eMMC * inline crypto hardware, and they shouldn't be used in scenarios where * such hardware isn't being used. * * Leaving this NULL is equivalent to always returning false. */ bool (*has_stable_inodes)(struct super_block *sb); /* * Return an array of pointers to the block devices to which the * filesystem may write encrypted file contents, NULL if the filesystem * only has a single such block device, or an ERR_PTR() on error. * * On successful non-NULL return, *num_devs is set to the number of * devices in the returned array. The caller must free the returned * array using kfree(). * * If the filesystem can use multiple block devices (other than block * devices that aren't used for encrypted file contents, such as * external journal devices), and wants to support inline encryption, * then it must implement this function. Otherwise it's not needed. */ struct block_device **(*get_devices)(struct super_block *sb, unsigned int *num_devs); }; static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info(). * I.e., another task may publish ->i_crypt_info concurrently, executing * a RELEASE barrier. We need to use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. */ return smp_load_acquire(&inode->i_crypt_info); } /** * fscrypt_needs_contents_encryption() - check whether an inode needs * contents encryption * @inode: the inode to check * * Return: %true iff the inode is an encrypted regular file and the kernel was * built with fscrypt support. * * If you need to know whether the encrypt bit is set even when the kernel was * built without fscrypt support, you must use IS_ENCRYPTED() directly instead. */ static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } /* * When d_splice_alias() moves a directory's no-key alias to its plaintext alias * as a result of the encryption key being added, DCACHE_NOKEY_NAME must be * cleared. Note that we don't have to support arbitrary moves of this flag * because fscrypt doesn't allow no-key names to be the source or target of a * rename(). */ static inline void fscrypt_handle_d_move(struct dentry *dentry) { dentry->d_flags &= ~DCACHE_NOKEY_NAME; } /** * fscrypt_is_nokey_name() - test whether a dentry is a no-key name * @dentry: the dentry to check * * This returns true if the dentry is a no-key dentry. A no-key dentry is a * dentry that was created in an encrypted directory that hasn't had its * encryption key added yet. Such dentries may be either positive or negative. * * When a filesystem is asked to create a new filename in an encrypted directory * and the new filename's dentry is a no-key dentry, it must fail the operation * with ENOKEY. This includes ->create(), ->mkdir(), ->mknod(), ->symlink(), * ->rename(), and ->link(). (However, ->rename() and ->link() are already * handled by fscrypt_prepare_rename() and fscrypt_prepare_link().) * * This is necessary because creating a filename requires the directory's * encryption key, but just checking for the key on the directory inode during * the final filesystem operation doesn't guarantee that the key was available * during the preceding dentry lookup. And the key must have already been * available during the dentry lookup in order for it to have been checked * whether the filename already exists in the directory and for the new file's * dentry not to be invalidated due to it incorrectly having the no-key flag. * * Return: %true if the dentry is a no-key name */ static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return dentry->d_flags & DCACHE_NOKEY_NAME; } /* crypto.c */ void fscrypt_enqueue_decrypt_work(struct work_struct *); struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, unsigned int len, unsigned int offs, gfp_t gfp_flags); int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags); int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs); int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num); static inline bool fscrypt_is_bounce_page(struct page *page) { return page->mapping == NULL; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { return (struct page *)page_private(bounce_page); } static inline bool fscrypt_is_bounce_folio(struct folio *folio) { return folio->mapping == NULL; } static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) { return bounce_folio->private; } void fscrypt_free_bounce_page(struct page *bounce_page); /* policy.c */ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg); int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); int fscrypt_context_for_new_inode(void *ctx, struct inode *inode); int fscrypt_set_context(struct inode *inode, void *fs_data); struct fscrypt_dummy_policy { const union fscrypt_policy *policy; }; int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy); bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2); void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb); static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return dummy_policy->policy != NULL; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { kfree(dummy_policy->policy); dummy_policy->policy = NULL; } /* keyring.c */ void fscrypt_destroy_keyring(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); /* keysetup.c */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret); void fscrypt_put_encryption_info(struct inode *inode); void fscrypt_free_inode(struct inode *inode); int fscrypt_drop_inode(struct inode *inode); /* fname.c */ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, int lookup, struct fscrypt_name *fname); static inline void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); } int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str); void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str); int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname); bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len); u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len); /* hooks.c */ int fscrypt_file_open(struct inode *inode, struct file *filp); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname); int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_readdir(struct inode *dir); int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr); int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags); int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link); const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done); int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat); static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { sb->s_cop = s_cop; } #else /* !CONFIG_FS_ENCRYPTION */ static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { return NULL; } static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return false; } static inline void fscrypt_handle_d_move(struct dentry *dentry) { } static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return false; } /* crypto.c */ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { } static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, unsigned int len, unsigned int offs, gfp_t gfp_flags) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num) { return -EOPNOTSUPP; } static inline bool fscrypt_is_bounce_page(struct page *page) { return false; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline bool fscrypt_is_bounce_folio(struct folio *folio) { return false; } static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline void fscrypt_free_bounce_page(struct page *bounce_page) { } /* policy.c */ static inline int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { return 0; } static inline int fscrypt_set_context(struct inode *inode, void *fs_data) { return -EOPNOTSUPP; } struct fscrypt_dummy_policy { }; static inline int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy) { return -EINVAL; } static inline bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2) { return true; } static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { } static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return false; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { } /* keyring.c */ static inline void fscrypt_destroy_keyring(struct super_block *sb) { } static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } /* keysetup.c */ static inline int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; return 0; } static inline void fscrypt_put_encryption_info(struct inode *inode) { return; } static inline void fscrypt_free_inode(struct inode *inode) { } static inline int fscrypt_drop_inode(struct inode *inode) { return 0; } /* fname.c */ static inline int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; memset(fname, 0, sizeof(*fname)); fname->usr_fname = iname; fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } static inline void fscrypt_free_filename(struct fscrypt_name *fname) { return; } static inline int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { return -EOPNOTSUPP; } static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { return; } static inline int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname) { return -EOPNOTSUPP; } static inline bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len) { /* Encryption support disabled; use standard comparison */ if (de_name_len != fname->disk_name.len) return false; return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); } static inline u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { WARN_ON_ONCE(1); return 0; } static inline int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { return 1; } /* bio.c */ static inline bool fscrypt_decrypt_bio(struct bio *bio) { return true; } static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { return -EOPNOTSUPP; } /* hooks.c */ static inline int fscrypt_file_open(struct inode *inode, struct file *filp) { if (IS_ENCRYPTED(inode)) return -EOPNOTSUPP; return 0; } static inline int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_readdir(struct inode *dir) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { return 0; } static inline int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; disk_link->name = (unsigned char *)target; disk_link->len = len + 1; if (disk_link->len > max_len) return -ENAMETOOLONG; return 0; } static inline int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { return -EOPNOTSUPP; } static inline const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) { return -EOPNOTSUPP; } static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { } #endif /* !CONFIG_FS_ENCRYPTION */ /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode); void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask); void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, const struct buffer_head *first_bh, gfp_t gfp_mask); bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk); bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh); bool fscrypt_dio_supported(struct inode *inode); u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return false; } static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask) { } static inline void fscrypt_set_bio_crypt_ctx_bh( struct bio *bio, const struct buffer_head *first_bh, gfp_t gfp_mask) { } static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) { return true; } static inline bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh) { return true; } static inline bool fscrypt_dio_supported(struct inode *inode) { return !fscrypt_needs_contents_encryption(inode); } static inline u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) { return nr_blocks; } #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ /** * fscrypt_inode_uses_inline_crypto() - test whether an inode uses inline * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the block layer via blk-crypto rather * than in the filesystem layer. */ static inline bool fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && __fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_inode_uses_fs_layer_crypto() - test whether an inode uses fs-layer * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the filesystem layer rather than in the * block layer via blk-crypto. */ static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && !__fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_has_encryption_key() - check whether an inode has had its key set up * @inode: the inode to check * * Return: %true if the inode has had its encryption key set up, else %false. * * Usually this should be preceded by fscrypt_get_encryption_info() to try to * set up the key first. */ static inline bool fscrypt_has_encryption_key(const struct inode *inode) { return fscrypt_get_inode_info(inode) != NULL; } /** * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted * directory * @old_dentry: an existing dentry for the inode being linked * @dir: the target directory * @dentry: negative dentry for the target filename * * A new link can only be added to an encrypted directory if the directory's * encryption key is available --- since otherwise we'd have no way to encrypt * the filename. * * We also verify that the link will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if the directory's encryption key is missing, * -EXDEV if the link would result in an inconsistent encryption policy, or * another -errno code. */ static inline int fscrypt_prepare_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_link(d_inode(old_dentry), dir, dentry); return 0; } /** * fscrypt_prepare_rename() - prepare for a rename between possibly-encrypted * directories * @old_dir: source directory * @old_dentry: dentry for source file * @new_dir: target directory * @new_dentry: dentry for target location (may be negative unless exchanging) * @flags: rename flags (we care at least about %RENAME_EXCHANGE) * * Prepare for ->rename() where the source and/or target directories may be * encrypted. A new link can only be added to an encrypted directory if the * directory's encryption key is available --- since otherwise we'd have no way * to encrypt the filename. A rename to an existing name, on the other hand, * *is* cryptographically possible without the key. However, we take the more * conservative approach and just forbid all no-key renames. * * We also verify that the rename will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the * rename would cause inconsistent encryption policies, or another -errno code. */ static inline int fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir)) return __fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, flags); return 0; } /** * fscrypt_prepare_lookup() - prepare to lookup a name in a possibly-encrypted * directory * @dir: directory being searched * @dentry: filename being looked up * @fname: (output) the name to use to search the on-disk directory * * Prepare for ->lookup() in a directory which may be encrypted by determining * the name that will actually be used to search the directory on-disk. If the * directory's encryption policy is supported by this kernel and its encryption * key is available, then the lookup is assumed to be by plaintext name; * otherwise, it is assumed to be by no-key name. * * This will set DCACHE_NOKEY_NAME on the dentry if the lookup is by no-key * name. In this case the filesystem must assign the dentry a dentry_operations * which contains fscrypt_d_revalidate (or contains a d_revalidate method that * calls fscrypt_d_revalidate), so that the dentry will be invalidated if the * directory's encryption key is later added. * * Return: 0 on success; -ENOENT if the directory's key is unavailable but the * filename isn't a valid no-key name, so a negative dentry should be created; * or another -errno code. */ static inline int fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_lookup(dir, dentry, fname); memset(fname, 0, sizeof(*fname)); fname->usr_fname = &dentry->d_name; fname->disk_name.name = (unsigned char *)dentry->d_name.name; fname->disk_name.len = dentry->d_name.len; return 0; } /** * fscrypt_prepare_readdir() - prepare to read a possibly-encrypted directory * @dir: the directory inode * * If the directory is encrypted and it doesn't already have its encryption key * set up, try to set it up so that the filenames will be listed in plaintext * form rather than in no-key form. * * Return: 0 on success; -errno on error. Note that the encryption key being * unavailable is not considered an error. It is also not an error if * the encryption policy is unsupported by this kernel; that is treated * like the key being unavailable, so that files can still be deleted. */ static inline int fscrypt_prepare_readdir(struct inode *dir) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_readdir(dir); return 0; } /** * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's * attributes * @dentry: dentry through which the inode is being changed * @attr: attributes to change * * Prepare for ->setattr() on a possibly-encrypted inode. On an encrypted file, * most attribute changes are allowed even without the encryption key. However, * without the encryption key we do have to forbid truncates. This is needed * because the size being truncated to may not be a multiple of the filesystem * block size, and in that case we'd have to decrypt the final block, zero the * portion past i_size, and re-encrypt it. (We *could* allow truncating to a * filesystem block boundary, but it's simpler to just forbid all truncates --- * and we already forbid all other contents modifications without the key.) * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code * if a problem occurred while setting up the encryption key. */ static inline int fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (IS_ENCRYPTED(d_inode(dentry))) return __fscrypt_prepare_setattr(dentry, attr); return 0; } /** * fscrypt_encrypt_symlink() - encrypt the symlink target if needed * @inode: symlink inode * @target: plaintext symlink target * @len: length of @target excluding null terminator * @disk_link: (in/out) the on-disk symlink target being prepared * * If the symlink target needs to be encrypted, then this function encrypts it * into @disk_link->name. fscrypt_prepare_symlink() must have been called * previously to compute @disk_link->len. If the filesystem did not allocate a * buffer for @disk_link->name after calling fscrypt_prepare_link(), then one * will be kmalloc()'ed and the filesystem will be responsible for freeing it. * * Return: 0 on success, -errno on failure */ static inline int fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(inode)) return __fscrypt_encrypt_symlink(inode, target, len, disk_link); return 0; } /* If *pagep is a bounce page, free it and set *pagep to the pagecache page */ static inline void fscrypt_finalize_bounce_page(struct page **pagep) { struct page *page = *pagep; if (fscrypt_is_bounce_page(page)) { *pagep = fscrypt_pagecache_page(page); fscrypt_free_bounce_page(page); } } #endif /* _LINUX_FSCRYPT_H */
9 9 9 9 9 8 7 8 8 1 1 1 1 8 8 8 8 8 8 6 2 2 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 // SPDX-License-Identifier: GPL-2.0-or-later /* * Create default crypto algorithm instances. * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <linux/completion.h> #include <linux/ctype.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/string.h> #include "internal.h" struct cryptomgr_param { struct rtattr *tb[CRYPTO_MAX_ATTRS + 2]; struct { struct rtattr attr; struct crypto_attr_type data; } type; struct { struct rtattr attr; struct crypto_attr_alg data; } attrs[CRYPTO_MAX_ATTRS]; char template[CRYPTO_MAX_ALG_NAME]; struct crypto_larval *larval; u32 otype; u32 omask; }; struct crypto_test_param { char driver[CRYPTO_MAX_ALG_NAME]; char alg[CRYPTO_MAX_ALG_NAME]; u32 type; }; static int cryptomgr_probe(void *data) { struct cryptomgr_param *param = data; struct crypto_template *tmpl; int err; tmpl = crypto_lookup_template(param->template); if (!tmpl) goto out; do { err = tmpl->create(tmpl, param->tb); } while (err == -EAGAIN && !signal_pending(current)); crypto_tmpl_put(tmpl); out: complete_all(&param->larval->completion); crypto_alg_put(&param->larval->alg); kfree(param); module_put_and_kthread_exit(0); } static int cryptomgr_schedule_probe(struct crypto_larval *larval) { struct task_struct *thread; struct cryptomgr_param *param; const char *name = larval->alg.cra_name; const char *p; unsigned int len; int i; if (!try_module_get(THIS_MODULE)) goto err; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) goto err_put_module; for (p = name; isalnum(*p) || *p == '-' || *p == '_'; p++) ; len = p - name; if (!len || *p != '(') goto err_free_param; memcpy(param->template, name, len); i = 0; for (;;) { name = ++p; for (; isalnum(*p) || *p == '-' || *p == '_'; p++) ; if (*p == '(') { int recursion = 0; for (;;) { if (!*++p) goto err_free_param; if (*p == '(') recursion++; else if (*p == ')' && !recursion--) break; } p++; } len = p - name; if (!len) goto err_free_param; param->attrs[i].attr.rta_len = sizeof(param->attrs[i]); param->attrs[i].attr.rta_type = CRYPTOA_ALG; memcpy(param->attrs[i].data.name, name, len); param->tb[i + 1] = &param->attrs[i].attr; i++; if (i >= CRYPTO_MAX_ATTRS) goto err_free_param; if (*p == ')') break; if (*p != ',') goto err_free_param; } if (!i) goto err_free_param; param->tb[i + 1] = NULL; param->type.attr.rta_len = sizeof(param->type); param->type.attr.rta_type = CRYPTOA_TYPE; param->type.data.type = larval->alg.cra_flags & ~CRYPTO_ALG_TESTED; param->type.data.mask = larval->mask & ~CRYPTO_ALG_TESTED; param->tb[0] = &param->type.attr; param->otype = larval->alg.cra_flags; param->omask = larval->mask; crypto_alg_get(&larval->alg); param->larval = larval; thread = kthread_run(cryptomgr_probe, param, "cryptomgr_probe"); if (IS_ERR(thread)) goto err_put_larval; return NOTIFY_STOP; err_put_larval: crypto_alg_put(&larval->alg); err_free_param: kfree(param); err_put_module: module_put(THIS_MODULE); err: return NOTIFY_OK; } static int cryptomgr_test(void *data) { struct crypto_test_param *param = data; u32 type = param->type; int err; err = alg_test(param->driver, param->alg, type, CRYPTO_ALG_TESTED); crypto_alg_tested(param->driver, err); kfree(param); module_put_and_kthread_exit(0); } static int cryptomgr_schedule_test(struct crypto_alg *alg) { struct task_struct *thread; struct crypto_test_param *param; if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) return NOTIFY_DONE; if (!try_module_get(THIS_MODULE)) goto err; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) goto err_put_module; memcpy(param->driver, alg->cra_driver_name, sizeof(param->driver)); memcpy(param->alg, alg->cra_name, sizeof(param->alg)); param->type = alg->cra_flags; thread = kthread_run(cryptomgr_test, param, "cryptomgr_test"); if (IS_ERR(thread)) goto err_free_param; return NOTIFY_STOP; err_free_param: kfree(param); err_put_module: module_put(THIS_MODULE); err: return NOTIFY_OK; } static int cryptomgr_notify(struct notifier_block *this, unsigned long msg, void *data) { switch (msg) { case CRYPTO_MSG_ALG_REQUEST: return cryptomgr_schedule_probe(data); case CRYPTO_MSG_ALG_REGISTER: return cryptomgr_schedule_test(data); case CRYPTO_MSG_ALG_LOADED: break; } return NOTIFY_DONE; } static struct notifier_block cryptomgr_notifier = { .notifier_call = cryptomgr_notify, }; static int __init cryptomgr_init(void) { return crypto_register_notifier(&cryptomgr_notifier); } static void __exit cryptomgr_exit(void) { int err = crypto_unregister_notifier(&cryptomgr_notifier); BUG_ON(err); } /* * This is arch_initcall() so that the crypto self-tests are run on algorithms * registered early by subsys_initcall(). subsys_initcall() is needed for * generic implementations so that they're available for comparison tests when * other implementations are registered later by module_init(). */ arch_initcall(cryptomgr_init); module_exit(cryptomgr_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Crypto Algorithm Manager");
186 6 365 751 744 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Internal procfs definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/binfmts.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> struct ctl_table_header; struct mempolicy; /* * This is not completely implemented yet. The idea is to * create an in-memory tree (like the actual /proc filesystem * tree) of these proc_dir_entries, so that we can dynamically * add new files to /proc. * * parent/subdir are used for the directory structure (every /proc file has a * parent, but "subdir" is empty for all non-directory entries). * subdir_node is used to build the rb tree "subdir" of the parent. */ struct proc_dir_entry { /* * number of callers into module in progress; * negative -> it's going away RSN */ atomic_t in_use; refcount_t refcnt; struct list_head pde_openers; /* who did ->open, but not ->release */ /* protects ->pde_openers and all struct pde_opener instances */ spinlock_t pde_unload_lock; struct completion *pde_unload_completion; const struct inode_operations *proc_iops; union { const struct proc_ops *proc_ops; const struct file_operations *proc_dir_ops; }; const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; int (*single_show)(struct seq_file *, void *); }; proc_write_t write; void *data; unsigned int state_size; unsigned int low_ino; nlink_t nlink; kuid_t uid; kgid_t gid; loff_t size; struct proc_dir_entry *parent; struct rb_root subdir; struct rb_node subdir_node; char *name; umode_t mode; u8 flags; u8 namelen; char inline_name[]; } __randomize_layout; #define SIZEOF_PDE ( \ sizeof(struct proc_dir_entry) < 128 ? 128 : \ sizeof(struct proc_dir_entry) < 192 ? 192 : \ sizeof(struct proc_dir_entry) < 256 ? 256 : \ sizeof(struct proc_dir_entry) < 512 ? 512 : \ 0) #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) static inline bool pde_is_permanent(const struct proc_dir_entry *pde) { return pde->flags & PROC_ENTRY_PERMANENT; } static inline void pde_make_permanent(struct proc_dir_entry *pde) { pde->flags |= PROC_ENTRY_PERMANENT; } extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); union proc_op { int (*proc_get_link)(struct dentry *, struct path *); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); const char *lsm; }; struct proc_inode { struct pid *pid; unsigned int fd; union proc_op op; struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; struct hlist_node sibling_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; } __randomize_layout; /* * General functions */ static inline struct proc_inode *PROC_I(const struct inode *inode) { return container_of(inode, struct proc_inode, vfs_inode); } static inline struct proc_dir_entry *PDE(const struct inode *inode) { return PROC_I(inode)->pde; } static inline struct pid *proc_pid(const struct inode *inode) { return PROC_I(inode)->pid; } static inline struct task_struct *get_proc_task(const struct inode *inode) { return get_pid_task(proc_pid(inode), PIDTYPE_PID); } void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid); unsigned name_to_int(const struct qstr *qstr); /* * Offset of the first process in the /proc root directory.. */ #define FIRST_PROCESS_ENTRY 256 /* Worst case buffer size needed for holding an integer. */ #define PROC_NUMBUF 13 /* * array.c */ extern const struct file_operations proc_tid_children_operations; extern void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape); extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_status(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); /* * base.c */ extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); struct dentry *proc_pid_lookup(struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ typedef struct dentry *instantiate_t(struct dentry *, struct task_struct *, const void *); bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, instantiate_t, struct task_struct *, const void *); /* * generic.c */ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, struct proc_dir_entry **parent, void *data); struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, struct proc_dir_entry *dp); extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file *, struct dir_context *); int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); static inline void pde_get(struct proc_dir_entry *pde) { refcount_inc(&pde->refcnt); } extern void pde_put(struct proc_dir_entry *); static inline bool is_empty_pde(const struct proc_dir_entry *pde) { return S_ISDIR(pde->mode) && !pde->proc_iops; } extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *); /* * inode.c */ struct pde_opener { struct list_head lh; struct file *file; bool closing; struct completion *c; } __randomize_layout; extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; extern const struct super_operations proc_sops; void proc_init_kmemcache(void); void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern void proc_entry_rundown(struct proc_dir_entry *); /* * proc_namespaces.c */ extern const struct inode_operations proc_ns_dir_inode_operations; extern const struct file_operations proc_ns_dir_operations; /* * proc_net.c */ extern const struct file_operations proc_net_operations; extern const struct inode_operations proc_net_inode_operations; #ifdef CONFIG_NET extern int proc_net_init(void); #else static inline int proc_net_init(void) { return 0; } #endif /* * proc_self.c */ extern int proc_setup_self(struct super_block *); /* * proc_thread_self.c */ extern int proc_setup_thread_self(struct super_block *); extern void proc_thread_self_init(void); /* * proc_sysctl.c */ #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); extern void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } static inline void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { } #endif /* * proc_tty.c */ #ifdef CONFIG_TTY extern void proc_tty_init(void); #else static inline void proc_tty_init(void) {} #endif /* * root.c */ extern struct proc_dir_entry proc_root; extern void proc_self_init(void); /* * task_[no]mmu.c */ struct mem_size_stats; struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; struct vma_iterator iter; #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif } __randomize_layout; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); extern const struct file_operations proc_pid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, unsigned long *, unsigned long *, unsigned long *, unsigned long *); extern void task_mem(struct seq_file *, struct mm_struct *); extern const struct dentry_operations proc_net_dentry_ops; static inline void pde_force_lookup(struct proc_dir_entry *pde) { /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ pde->proc_dops = &proc_net_dentry_ops; }
82 5 47 47 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 // SPDX-License-Identifier: GPL-2.0 #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/perf_event.h> #include <linux/bug.h> #include <linux/stddef.h> #include <asm/perf_regs.h> #include <asm/ptrace.h> #ifdef CONFIG_X86_32 #define PERF_REG_X86_MAX PERF_REG_X86_32_MAX #else #define PERF_REG_X86_MAX PERF_REG_X86_64_MAX #endif #define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { PT_REGS_OFFSET(PERF_REG_X86_AX, ax), PT_REGS_OFFSET(PERF_REG_X86_BX, bx), PT_REGS_OFFSET(PERF_REG_X86_CX, cx), PT_REGS_OFFSET(PERF_REG_X86_DX, dx), PT_REGS_OFFSET(PERF_REG_X86_SI, si), PT_REGS_OFFSET(PERF_REG_X86_DI, di), PT_REGS_OFFSET(PERF_REG_X86_BP, bp), PT_REGS_OFFSET(PERF_REG_X86_SP, sp), PT_REGS_OFFSET(PERF_REG_X86_IP, ip), PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags), PT_REGS_OFFSET(PERF_REG_X86_CS, cs), PT_REGS_OFFSET(PERF_REG_X86_SS, ss), #ifdef CONFIG_X86_32 PT_REGS_OFFSET(PERF_REG_X86_DS, ds), PT_REGS_OFFSET(PERF_REG_X86_ES, es), PT_REGS_OFFSET(PERF_REG_X86_FS, fs), PT_REGS_OFFSET(PERF_REG_X86_GS, gs), #else /* * The pt_regs struct does not store * ds, es, fs, gs in 64 bit mode. */ (unsigned int) -1, (unsigned int) -1, (unsigned int) -1, (unsigned int) -1, #endif #ifdef CONFIG_X86_64 PT_REGS_OFFSET(PERF_REG_X86_R8, r8), PT_REGS_OFFSET(PERF_REG_X86_R9, r9), PT_REGS_OFFSET(PERF_REG_X86_R10, r10), PT_REGS_OFFSET(PERF_REG_X86_R11, r11), PT_REGS_OFFSET(PERF_REG_X86_R12, r12), PT_REGS_OFFSET(PERF_REG_X86_R13, r13), PT_REGS_OFFSET(PERF_REG_X86_R14, r14), PT_REGS_OFFSET(PERF_REG_X86_R15, r15), #endif }; u64 perf_reg_value(struct pt_regs *regs, int idx) { struct x86_perf_regs *perf_regs; if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) { perf_regs = container_of(regs, struct x86_perf_regs, regs); if (!perf_regs->xmm_regs) return 0; return perf_regs->xmm_regs[idx - PERF_REG_X86_XMM0]; } if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset))) return 0; return regs_get_register(regs, pt_regs_offset[idx]); } #define PERF_REG_X86_RESERVED (((1ULL << PERF_REG_X86_XMM0) - 1) & \ ~((1ULL << PERF_REG_X86_MAX) - 1)) #ifdef CONFIG_X86_32 #define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \ (1ULL << PERF_REG_X86_R9) | \ (1ULL << PERF_REG_X86_R10) | \ (1ULL << PERF_REG_X86_R11) | \ (1ULL << PERF_REG_X86_R12) | \ (1ULL << PERF_REG_X86_R13) | \ (1ULL << PERF_REG_X86_R14) | \ (1ULL << PERF_REG_X86_R15)) int perf_reg_validate(u64 mask) { if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED))) return -EINVAL; return 0; } u64 perf_reg_abi(struct task_struct *task) { return PERF_SAMPLE_REGS_ABI_32; } void perf_get_regs_user(struct perf_regs *regs_user, struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); } #else /* CONFIG_X86_64 */ #define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \ (1ULL << PERF_REG_X86_ES) | \ (1ULL << PERF_REG_X86_FS) | \ (1ULL << PERF_REG_X86_GS)) int perf_reg_validate(u64 mask) { if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED))) return -EINVAL; return 0; } u64 perf_reg_abi(struct task_struct *task) { if (!user_64bit_mode(task_pt_regs(task))) return PERF_SAMPLE_REGS_ABI_32; else return PERF_SAMPLE_REGS_ABI_64; } static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs); void perf_get_regs_user(struct perf_regs *regs_user, struct pt_regs *regs) { struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs); struct pt_regs *user_regs = task_pt_regs(current); if (!in_nmi()) { regs_user->regs = user_regs; regs_user->abi = perf_reg_abi(current); return; } /* * If we're in an NMI that interrupted task_pt_regs setup, then * we can't sample user regs at all. This check isn't really * sufficient, though, as we could be in an NMI inside an interrupt * that happened during task_pt_regs setup. */ if (regs->sp > (unsigned long)&user_regs->r11 && regs->sp <= (unsigned long)(user_regs + 1)) { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; return; } /* * These registers are always saved on 64-bit syscall entry. * On 32-bit entry points, they are saved too except r8..r11. */ regs_user_copy->ip = user_regs->ip; regs_user_copy->ax = user_regs->ax; regs_user_copy->cx = user_regs->cx; regs_user_copy->dx = user_regs->dx; regs_user_copy->si = user_regs->si; regs_user_copy->di = user_regs->di; regs_user_copy->r8 = user_regs->r8; regs_user_copy->r9 = user_regs->r9; regs_user_copy->r10 = user_regs->r10; regs_user_copy->r11 = user_regs->r11; regs_user_copy->orig_ax = user_regs->orig_ax; regs_user_copy->flags = user_regs->flags; regs_user_copy->sp = user_regs->sp; regs_user_copy->cs = user_regs->cs; regs_user_copy->ss = user_regs->ss; /* * Store user space frame-pointer value on sample * to facilitate stack unwinding for cases when * user space executable code has such support * enabled at compile time: */ regs_user_copy->bp = user_regs->bp; regs_user_copy->bx = -1; regs_user_copy->r12 = -1; regs_user_copy->r13 = -1; regs_user_copy->r14 = -1; regs_user_copy->r15 = -1; /* * For this to be at all useful, we need a reasonable guess for * the ABI. Be careful: we're in NMI context, and we're * considering current to be the current task, so we should * be careful not to look at any other percpu variables that might * change during context switches. */ regs_user->abi = user_64bit_mode(user_regs) ? PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; regs_user->regs = regs_user_copy; } #endif /* CONFIG_X86_32 */
54 54 54 54 54 54 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 /* * Cryptographic API. * * Glue code for the SHA512 Secure Hash Algorithm assembler * implementation using supplemental SSE3 / AVX / AVX2 instructions. * * This file is based on sha512_generic.c * * Copyright (C) 2013 Intel Corporation * Author: Tim Chen <tim.c.chen@linux.intel.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/types.h> #include <crypto/sha2.h> #include <crypto/sha512_base.h> #include <asm/cpu_device_id.h> #include <asm/simd.h> asmlinkage void sha512_transform_ssse3(struct sha512_state *state, const u8 *data, int blocks); static int sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha512_block_fn *sha512_xform) { struct sha512_state *sctx = shash_desc_ctx(desc); if (!crypto_simd_usable() || (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) return crypto_sha512_update(desc, data, len); /* * Make sure struct sha512_state begins directly with the SHA512 * 512-bit internal state, as this is what the asm functions expect. */ BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); kernel_fpu_begin(); sha512_base_do_update(desc, data, len, sha512_xform); kernel_fpu_end(); return 0; } static int sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha512_block_fn *sha512_xform) { if (!crypto_simd_usable()) return crypto_sha512_finup(desc, data, len, out); kernel_fpu_begin(); if (len) sha512_base_do_update(desc, data, len, sha512_xform); sha512_base_do_finalize(desc, sha512_xform); kernel_fpu_end(); return sha512_base_finish(desc, out); } static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_update(desc, data, len, sha512_transform_ssse3); } static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return sha512_finup(desc, data, len, out, sha512_transform_ssse3); } /* Add padding and return the message digest. */ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) { return sha512_ssse3_finup(desc, NULL, 0, out); } static struct shash_alg sha512_ssse3_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_ssse3_update, .final = sha512_ssse3_final, .finup = sha512_ssse3_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-ssse3", .cra_priority = 150, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_ssse3_update, .final = sha512_ssse3_final, .finup = sha512_ssse3_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-ssse3", .cra_priority = 150, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int register_sha512_ssse3(void) { if (boot_cpu_has(X86_FEATURE_SSSE3)) return crypto_register_shashes(sha512_ssse3_algs, ARRAY_SIZE(sha512_ssse3_algs)); return 0; } static void unregister_sha512_ssse3(void) { if (boot_cpu_has(X86_FEATURE_SSSE3)) crypto_unregister_shashes(sha512_ssse3_algs, ARRAY_SIZE(sha512_ssse3_algs)); } asmlinkage void sha512_transform_avx(struct sha512_state *state, const u8 *data, int blocks); static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } return true; } static int sha512_avx_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_update(desc, data, len, sha512_transform_avx); } static int sha512_avx_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return sha512_finup(desc, data, len, out, sha512_transform_avx); } /* Add padding and return the message digest. */ static int sha512_avx_final(struct shash_desc *desc, u8 *out) { return sha512_avx_finup(desc, NULL, 0, out); } static struct shash_alg sha512_avx_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_avx_update, .final = sha512_avx_final, .finup = sha512_avx_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-avx", .cra_priority = 160, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_avx_update, .final = sha512_avx_final, .finup = sha512_avx_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-avx", .cra_priority = 160, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int register_sha512_avx(void) { if (avx_usable()) return crypto_register_shashes(sha512_avx_algs, ARRAY_SIZE(sha512_avx_algs)); return 0; } static void unregister_sha512_avx(void) { if (avx_usable()) crypto_unregister_shashes(sha512_avx_algs, ARRAY_SIZE(sha512_avx_algs)); } asmlinkage void sha512_transform_rorx(struct sha512_state *state, const u8 *data, int blocks); static int sha512_avx2_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_update(desc, data, len, sha512_transform_rorx); } static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return sha512_finup(desc, data, len, out, sha512_transform_rorx); } /* Add padding and return the message digest. */ static int sha512_avx2_final(struct shash_desc *desc, u8 *out) { return sha512_avx2_finup(desc, NULL, 0, out); } static struct shash_alg sha512_avx2_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_avx2_update, .final = sha512_avx2_final, .finup = sha512_avx2_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-avx2", .cra_priority = 170, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_avx2_update, .final = sha512_avx2_final, .finup = sha512_avx2_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-avx2", .cra_priority = 170, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static bool avx2_usable(void) { if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) return true; return false; } static int register_sha512_avx2(void) { if (avx2_usable()) return crypto_register_shashes(sha512_avx2_algs, ARRAY_SIZE(sha512_avx2_algs)); return 0; } static const struct x86_cpu_id module_cpu_ids[] = { X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); static void unregister_sha512_avx2(void) { if (avx2_usable()) crypto_unregister_shashes(sha512_avx2_algs, ARRAY_SIZE(sha512_avx2_algs)); } static int __init sha512_ssse3_mod_init(void) { if (!x86_match_cpu(module_cpu_ids)) return -ENODEV; if (register_sha512_ssse3()) goto fail; if (register_sha512_avx()) { unregister_sha512_ssse3(); goto fail; } if (register_sha512_avx2()) { unregister_sha512_avx(); unregister_sha512_ssse3(); goto fail; } return 0; fail: return -ENODEV; } static void __exit sha512_ssse3_mod_fini(void) { unregister_sha512_avx2(); unregister_sha512_avx(); unregister_sha512_ssse3(); } module_init(sha512_ssse3_mod_init); module_exit(sha512_ssse3_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS_CRYPTO("sha512"); MODULE_ALIAS_CRYPTO("sha512-ssse3"); MODULE_ALIAS_CRYPTO("sha512-avx"); MODULE_ALIAS_CRYPTO("sha512-avx2"); MODULE_ALIAS_CRYPTO("sha384"); MODULE_ALIAS_CRYPTO("sha384-ssse3"); MODULE_ALIAS_CRYPTO("sha384-avx"); MODULE_ALIAS_CRYPTO("sha384-avx2");
2643 2481 2479 2480 2268 1 937 939 938 225 939 2 751 168 168 111 779 751 523 751 5 5 1 779 779 779 39 717 136 136 136 77 719 719 718 718 497 497 718 8 1295 1333 304 343 4 4 4 1 1 4 1 4 324 7 7 324 324 3 1 1 1 159 163 748 57 4 3 4 778 779 779 40 779 778 748 749 748 748 749 749 3 747 748 747 748 749 748 748 748 746 748 748 310 749 136 747 746 392 749 749 747 402 747 747 746 747 747 651 746 746 749 748 746 746 2 745 751 746 750 751 751 491 751 751 749 750 750 749 751 751 751 749 749 749 749 749 749 749 749 749 749 749 749 652 652 747 746 747 745 189 744 20 2 2 1 25 1 1 25 1 26 25 25 26 1 25 25 5 1 26 5 24 25 25 25 25 24 24 25 25 25 5 4 5 6 5 1 6 260 259 2 245 244 233 233 232 1 231 1 1 230 1 228 1 228 228 4 3 3 234 1 1 1 1 1 1 1 1 1 1 1 120 82 62 48 48 87 80 79 59 1 1 224 3 222 224 5 2 2 2 1 220 1 1 219 32 32 32 26 213 1 205 205 3 1 203 203 17 203 81 164 164 161 5 160 1 159 158 159 159 159 159 159 1 159 4 2 158 1 157 5 5 5 152 2 152 151 10 182 145 33 4 9 200 2 2 75 73 73 71 2 72 71 75 6 5 5 6 12 10 9 9 7 2 1 8 2 9 174 172 170 2 1 1 170 166 86 171 170 171 171 171 86 86 40 171 164 149 163 160 160 160 5 3 1 2 1 1 1 174 149 1 5 4 5 4 3 2 2 2 1 1 1 1 1 1 2 4 2 2 1 1 1 2 2 54 38 38 55 55 55 55 55 55 55 55 55 55 50 55 55 55 719 413 645 11 11 11 4 3 1 2 1 1 1 4 3 1 3 11 11 11 11 11 13 11 13 14 12 12 12 14 14 14 14 13 13 13 13 13 13 13 4 13 4 4 13 4 13 13 13 13 13 13 14 1 4 2 2 1 1 1 5 1 1 5 40 39 37 37 37 37 40 40 3 40 40 40 17 17 17 40 25 25 25 40 40 37 36 36 1 2 6 34 33 20 19 13 14 1 14 37 9 8 8 1 6 5 3 3 2 3 1 3 9 41 18 17 17 2 17 17 17 17 17 17 17 1 17 17 17 17 17 17 2 2 2 2 2 2 2 41 40 41 41 16 13 12 10 10 10 10 13 33 24 24 21 21 21 21 32 17 17 17 7 25 25 25 25 25 25 25 25 30 5 2 1 4 2 2 1 2 2 2 2 2 43 41 43 53 3 50 53 8 7 6 6 6 8 5 5 39 36 36 38 1 37 36 36 36 36 36 31 36 36 36 36 6 39 6 5 5 4 3 1 1 6 1 1 1 1 1 1 1 1 1 2 2 1 1 3 2 1 1 1 3 3 1 1 1 3 939 938 617 937 244 222 23 22 243 57 243 243 243 694 568 130 128 691 691 89 3 1 1 691 37 37 37 646 648 910 942 1483 1483 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Routing netlink socket interface: protocol independent part. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Vitaly E. Lavrov RTA_OK arithmetic was wrong. */ #include <linux/bitops.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/security.h> #include <linux/mutex.h> #include <linux/if_addr.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/pci.h> #include <linux/etherdevice.h> #include <linux/bpf.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> #include <net/route.h> #include <net/udp.h> #include <net/tcp.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <net/fib_rules.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/devlink.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/addrconf.h> #endif #include <linux/dpll.h> #include "dev.h" #define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 44 struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; struct module *owner; unsigned int flags; struct rcu_head rcu; }; static DEFINE_MUTEX(rtnl_mutex); void rtnl_lock(void) { mutex_lock(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_lock); int rtnl_lock_killable(void) { return mutex_lock_killable(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_lock_killable); static struct sk_buff *defer_kfree_skb_list; void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) { if (head && tail) { tail->next = defer_kfree_skb_list; defer_kfree_skb_list = head; } } EXPORT_SYMBOL(rtnl_kfree_skbs); void __rtnl_unlock(void) { struct sk_buff *head = defer_kfree_skb_list; defer_kfree_skb_list = NULL; /* Ensure that we didn't actually add any TODO item when __rtnl_unlock() * is used. In some places, e.g. in cfg80211, we have code that will do * something like * rtnl_lock() * wiphy_lock() * ... * rtnl_unlock() * * and because netdev_run_todo() acquires the RTNL for items on the list * we could cause a situation such as this: * Thread 1 Thread 2 * rtnl_lock() * unregister_netdevice() * __rtnl_unlock() * rtnl_lock() * wiphy_lock() * rtnl_unlock() * netdev_run_todo() * __rtnl_unlock() * * // list not empty now * // because of thread 2 * rtnl_lock() * while (!list_empty(...)) * rtnl_lock() * wiphy_lock() * **** DEADLOCK **** * * However, usage of __rtnl_unlock() is rare, and so we can ensure that * it's not used in cases where something is added to do the list. */ WARN_ON(!list_empty(&net_todo_list)); mutex_unlock(&rtnl_mutex); while (head) { struct sk_buff *next = head->next; kfree_skb(head); cond_resched(); head = next; } } void rtnl_unlock(void) { /* This fellow will unlock it for us. */ netdev_run_todo(); } EXPORT_SYMBOL(rtnl_unlock); int rtnl_trylock(void) { return mutex_trylock(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_trylock); int rtnl_is_locked(void) { return mutex_is_locked(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_is_locked); bool refcount_dec_and_rtnl_lock(refcount_t *r) { return refcount_dec_and_mutex_lock(r, &rtnl_mutex); } EXPORT_SYMBOL(refcount_dec_and_rtnl_lock); #ifdef CONFIG_PROVE_LOCKING bool lockdep_rtnl_is_held(void) { return lockdep_is_held(&rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { int msgindex = msgtype - RTM_BASE; /* * msgindex < 0 implies someone tried to register a netlink * control code. msgindex >= RTM_NR_MSGTYPES may indicate that * the message type has not been added to linux/rtnetlink.h */ BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); return msgindex; } static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) { struct rtnl_link __rcu **tab; if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) protocol = PF_UNSPEC; tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); if (!tab) tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); return rcu_dereference_rtnl(tab[msgtype]); } static int rtnl_register_internal(struct module *owner, int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { struct rtnl_link *link, *old; struct rtnl_link __rcu **tab; int msgindex; int ret = -ENOBUFS; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (tab == NULL) { tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); if (!tab) goto unlock; /* ensures we see the 0 stores */ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } old = rtnl_dereference(tab[msgindex]); if (old) { link = kmemdup(old, sizeof(*old), GFP_KERNEL); if (!link) goto unlock; } else { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) goto unlock; } WARN_ON(link->owner && link->owner != owner); link->owner = owner; WARN_ON(doit && link->doit && link->doit != doit); if (doit) link->doit = doit; WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); if (dumpit) link->dumpit = dumpit; WARN_ON(rtnl_msgtype_kind(msgtype) != RTNL_KIND_DEL && (flags & RTNL_FLAG_BULK_DEL_SUPPORTED)); link->flags |= flags; /* publish protocol:msgtype */ rcu_assign_pointer(tab[msgindex], link); ret = 0; if (old) kfree_rcu(old, rcu); unlock: rtnl_unlock(); return ret; } /** * rtnl_register_module - Register a rtnetlink message type * * @owner: module registering the hook (THIS_MODULE) * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions * * Like rtnl_register, but for use by removable modules. */ int rtnl_register_module(struct module *owner, int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { return rtnl_register_internal(owner, protocol, msgtype, doit, dumpit, flags); } EXPORT_SYMBOL_GPL(rtnl_register_module); /** * rtnl_register - Register a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions * * Registers the specified function pointers (at least one of them has * to be non-NULL) to be called whenever a request message for the * specified protocol family and message type is received. * * The special protocol family PF_UNSPEC may be used to define fallback * function pointers for the case when no entry for the specific protocol * family exists. */ void rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { int err; err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, flags); if (err) pr_err("Unable to register rtnetlink message handler, " "protocol = %d, message type = %d\n", protocol, msgtype); } /** * rtnl_unregister - Unregister a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * * Returns 0 on success or a negative error code. */ int rtnl_unregister(int protocol, int msgtype) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (!tab) { rtnl_unlock(); return -ENOENT; } link = rtnl_dereference(tab[msgindex]); RCU_INIT_POINTER(tab[msgindex], NULL); rtnl_unlock(); kfree_rcu(link, rcu); return 0; } EXPORT_SYMBOL_GPL(rtnl_unregister); /** * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol * @protocol : Protocol family or PF_UNSPEC * * Identical to calling rtnl_unregster() for all registered message types * of a certain protocol family. */ void rtnl_unregister_all(int protocol) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (!tab) { rtnl_unlock(); return; } RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { link = rtnl_dereference(tab[msgindex]); if (!link) continue; RCU_INIT_POINTER(tab[msgindex], NULL); kfree_rcu(link, rcu); } rtnl_unlock(); synchronize_net(); kfree(tab); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); static LIST_HEAD(link_ops); static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) { const struct rtnl_link_ops *ops; list_for_each_entry(ops, &link_ops, list) { if (!strcmp(ops->kind, kind)) return ops; } return NULL; } /** * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register * * The caller must hold the rtnl_mutex. This function should be used * by drivers that create devices during module initialization. It * must be called before registering the devices. * * Returns 0 on success or a negative error code. */ int __rtnl_link_register(struct rtnl_link_ops *ops) { if (rtnl_link_ops_get(ops->kind)) return -EEXIST; /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not * fill up dellink as well. That disables rtnl_dellink. */ if ((ops->alloc || ops->setup) && !ops->dellink) ops->dellink = unregister_netdevice_queue; list_add_tail(&ops->list, &link_ops); return 0; } EXPORT_SYMBOL_GPL(__rtnl_link_register); /** * rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register * * Returns 0 on success or a negative error code. */ int rtnl_link_register(struct rtnl_link_ops *ops) { int err; /* Sanity-check max sizes to avoid stack buffer overflow. */ if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) return -EINVAL; rtnl_lock(); err = __rtnl_link_register(ops); rtnl_unlock(); return err; } EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) { struct net_device *dev; LIST_HEAD(list_kill); for_each_netdev(net, dev) { if (dev->rtnl_link_ops == ops) ops->dellink(dev, &list_kill); } unregister_netdevice_many(&list_kill); } /** * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister * * The caller must hold the rtnl_mutex and guarantee net_namespace_list * integrity (hold pernet_ops_rwsem for writing to close the race * with setup_net() and cleanup_net()). */ void __rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; for_each_net(net) { __rtnl_kill_links(net, ops); } list_del(&ops->list); } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace. */ static void rtnl_lock_unregistering_all(void) { struct net *net; bool unregistering; DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { unregistering = false; rtnl_lock(); /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ for_each_net(net) { if (atomic_read(&net->dev_unreg_count) > 0) { unregistering = true; break; } } if (!unregistering) break; __rtnl_unlock(); wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&netdev_unregistering_wq, &wait); } /** * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); rtnl_unlock(); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; size_t size = 0; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (!master_dev) goto out; ops = master_dev->rtnl_link_ops; if (!ops || !ops->get_slave_size) goto out; /* IFLA_INFO_SLAVE_DATA + nested data */ size = nla_total_size(sizeof(struct nlattr)) + ops->get_slave_size(master_dev, dev); out: rcu_read_unlock(); return size; } static size_t rtnl_link_get_size(const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; size_t size; if (!ops) return 0; size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ if (ops->get_size) /* IFLA_INFO_DATA + nested data */ size += nla_total_size(sizeof(struct nlattr)) + ops->get_size(dev); if (ops->get_xstats_size) /* IFLA_INFO_XSTATS */ size += nla_total_size(ops->get_xstats_size(dev)); size += rtnl_link_get_slave_info_data_size(dev); return size; } static LIST_HEAD(rtnl_af_ops); static const struct rtnl_af_ops *rtnl_af_lookup(const int family) { const struct rtnl_af_ops *ops; ASSERT_RTNL(); list_for_each_entry(ops, &rtnl_af_ops, list) { if (ops->family == family) return ops; } return NULL; } /** * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * * Returns 0 on success or a negative error code. */ void rtnl_af_register(struct rtnl_af_ops *ops) { rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); } EXPORT_SYMBOL_GPL(rtnl_af_register); /** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. * @ops: struct rtnl_af_ops * to unregister */ void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); list_del_rcu(&ops->list); rtnl_unlock(); synchronize_rcu(); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); static size_t rtnl_link_get_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct rtnl_af_ops *af_ops; size_t size; /* IFLA_AF_SPEC */ size = nla_total_size(sizeof(struct nlattr)); rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_link_af_size) { /* AF_* + nested data */ size += nla_total_size(sizeof(struct nlattr)) + af_ops->get_link_af_size(dev, ext_filter_mask); } } rcu_read_unlock(); return size; } static bool rtnl_have_link_slave_info(const struct net_device *dev) { struct net_device *master_dev; bool ret = false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (master_dev && master_dev->rtnl_link_ops) ret = true; rcu_read_unlock(); return ret; } static int rtnl_link_slave_info_fill(struct sk_buff *skb, const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; struct nlattr *slave_data; int err; master_dev = netdev_master_upper_dev_get((struct net_device *) dev); if (!master_dev) return 0; ops = master_dev->rtnl_link_ops; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_slave_info) { slave_data = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA); if (!slave_data) return -EMSGSIZE; err = ops->fill_slave_info(skb, master_dev, dev); if (err < 0) goto err_cancel_slave_data; nla_nest_end(skb, slave_data); } return 0; err_cancel_slave_data: nla_nest_cancel(skb, slave_data); return err; } static int rtnl_link_info_fill(struct sk_buff *skb, const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; struct nlattr *data; int err; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_xstats) { err = ops->fill_xstats(skb, dev); if (err < 0) return err; } if (ops->fill_info) { data = nla_nest_start_noflag(skb, IFLA_INFO_DATA); if (data == NULL) return -EMSGSIZE; err = ops->fill_info(skb, dev); if (err < 0) goto err_cancel_data; nla_nest_end(skb, data); } return 0; err_cancel_data: nla_nest_cancel(skb, data); return err; } static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *linkinfo; int err = -EMSGSIZE; linkinfo = nla_nest_start_noflag(skb, IFLA_LINKINFO); if (linkinfo == NULL) goto out; err = rtnl_link_info_fill(skb, dev); if (err < 0) goto err_cancel_link; err = rtnl_link_slave_info_fill(skb, dev); if (err < 0) goto err_cancel_link; nla_nest_end(skb, linkinfo); return 0; err_cancel_link: nla_nest_cancel(skb, linkinfo); out: return err; } int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL); } int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) { struct sock *rtnl = net->rtnl; return nlmsg_unicast(rtnl, skb, pid); } EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, const struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags); } EXPORT_SYMBOL(rtnl_notify); void rtnl_set_sk_err(struct net *net, u32 group, int error) { struct sock *rtnl = net->rtnl; netlink_set_err(rtnl, 0, group, error); } EXPORT_SYMBOL(rtnl_set_sk_err); int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) { struct nlattr *mx; int i, valid = 0; /* nothing is dumped for dst_default_metrics, so just skip the loop */ if (metrics == dst_default_metrics.metrics) return 0; mx = nla_nest_start_noflag(skb, RTA_METRICS); if (mx == NULL) return -ENOBUFS; for (i = 0; i < RTAX_MAX; i++) { if (metrics[i]) { if (i == RTAX_CC_ALGO - 1) { char tmp[TCP_CA_NAME_MAX], *name; name = tcp_ca_get_name_by_key(metrics[i], tmp); if (!name) continue; if (nla_put_string(skb, i + 1, name)) goto nla_put_failure; } else if (i == RTAX_FEATURES - 1) { u32 user_features = metrics[i] & RTAX_FEATURE_MASK; if (!user_features) continue; BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); if (nla_put_u32(skb, i + 1, user_features)) goto nla_put_failure; } else { if (nla_put_u32(skb, i + 1, metrics[i])) goto nla_put_failure; } valid++; } } if (!valid) { nla_nest_cancel(skb, mx); return 0; } return nla_nest_end(skb, mx); nla_put_failure: nla_nest_cancel(skb, mx); return -EMSGSIZE; } EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error) { struct rta_cacheinfo ci = { .rta_error = error, .rta_id = id, }; if (dst) { ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); ci.rta_used = dst->__use; ci.rta_clntref = rcuref_read(&dst->__rcuref); } if (expires) { unsigned long clock; clock = jiffies_to_clock_t(abs(expires)); clock = min_t(unsigned long, clock, INT_MAX); ci.rta_expires = (expires > 0) ? clock : -clock; } return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); static void set_operstate(struct net_device *dev, unsigned char transition) { unsigned char operstate = dev->operstate; switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; case IF_OPER_TESTING: if (netif_oper_up(dev)) operstate = IF_OPER_TESTING; break; case IF_OPER_DORMANT: if (netif_oper_up(dev)) operstate = IF_OPER_DORMANT; break; } if (dev->operstate != operstate) { write_lock(&dev_base_lock); dev->operstate = operstate; write_unlock(&dev_base_lock); netdev_state_change(dev); } } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) { return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) | (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); } static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, const struct ifinfomsg *ifm) { unsigned int flags = ifm->ifi_flags; /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ if (ifm->ifi_change) flags = (flags & ifm->ifi_change) | (rtnl_dev_get_flags(dev) & ~ifm->ifi_change); return flags; } static void copy_rtnl_link_stats(struct rtnl_link_stats *a, const struct rtnl_link_stats64 *b) { a->rx_packets = b->rx_packets; a->tx_packets = b->tx_packets; a->rx_bytes = b->rx_bytes; a->tx_bytes = b->tx_bytes; a->rx_errors = b->rx_errors; a->tx_errors = b->tx_errors; a->rx_dropped = b->rx_dropped; a->tx_dropped = b->tx_dropped; a->multicast = b->multicast; a->collisions = b->collisions; a->rx_length_errors = b->rx_length_errors; a->rx_over_errors = b->rx_over_errors; a->rx_crc_errors = b->rx_crc_errors; a->rx_frame_errors = b->rx_frame_errors; a->rx_fifo_errors = b->rx_fifo_errors; a->rx_missed_errors = b->rx_missed_errors; a->tx_aborted_errors = b->tx_aborted_errors; a->tx_carrier_errors = b->tx_carrier_errors; a->tx_fifo_errors = b->tx_fifo_errors; a->tx_heartbeat_errors = b->tx_heartbeat_errors; a->tx_window_errors = b->tx_window_errors; a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; a->rx_nohandler = b->rx_nohandler; } /* All VF info */ static inline int rtnl_vfinfo_size(const struct net_device *dev, u32 ext_filter_mask) { if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); size_t size = nla_total_size(0); size += num_vfs * (nla_total_size(0) + nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_broadcast)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ nla_total_size(MAX_VLAN_LIST_LEN * sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + nla_total_size(sizeof(struct ifla_vf_trust))); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { size += num_vfs * (nla_total_size(0) + /* nest IFLA_VF_STATS */ /* IFLA_VF_STATS_RX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_BROADCAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_DROPPED */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_DROPPED */ nla_total_size_64bit(sizeof(__u64))); } return size; } else return 0; } static size_t rtnl_port_size(const struct net_device *dev, u32 ext_filter_mask) { size_t port_size = nla_total_size(4) /* PORT_VF */ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + nla_total_size(1) /* PROT_VDP_REQUEST */ + nla_total_size(2); /* PORT_VDP_RESPONSE */ size_t vf_ports_size = nla_total_size(sizeof(struct nlattr)); size_t vf_port_size = nla_total_size(sizeof(struct nlattr)) + port_size; size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + port_size; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; if (dev_num_vf(dev->dev.parent)) return port_self_size + vf_ports_size + vf_port_size * dev_num_vf(dev->dev.parent); else return port_self_size; } static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ nla_total_size(4); /* XDP_<mode>_PROG_ID */ return xdp_size; } static size_t rtnl_prop_list_size(const struct net_device *dev) { struct netdev_name_node *name_node; size_t size; if (list_empty(&dev->name_node->list)) return 0; size = nla_total_size(0); list_for_each_entry(name_node, &dev->name_node->list, list) size += nla_total_size(ALTIFNAMSIZ); return size; } static size_t rtnl_proto_down_size(const struct net_device *dev) { size_t size = nla_total_size(1); if (dev->proto_down_reason) size += nla_total_size(0) + nla_total_size(4); return size; } static size_t rtnl_devlink_port_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DEVLINK_PORT */ if (dev->devlink_port) size += devlink_nl_port_handle_size(dev->devlink_port); return size; } static size_t rtnl_dpll_pin_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */ size += dpll_msg_pin_handle_size(netdev_dpll_pin(dev)); return size; } static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) + nla_total_size(sizeof(struct rtnl_link_stats)) + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ + nla_total_size(4) /* IFLA_WEIGHT */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + nla_total_size(4) /* IFLA_ALLMULTI */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(4) /* IFLA_LINK_NETNSID */ + nla_total_size(4) /* IFLA_GROUP */ + nla_total_size(ext_filter_mask & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + rtnl_proto_down_size(dev) /* proto down */ + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + rtnl_prop_list_size(dev) + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + 0; } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *vf_ports; struct nlattr *vf_port; int vf; int err; vf_ports = nla_nest_start_noflag(skb, IFLA_VF_PORTS); if (!vf_ports) return -EMSGSIZE; for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { vf_port = nla_nest_start_noflag(skb, IFLA_VF_PORT); if (!vf_port) goto nla_put_failure; if (nla_put_u32(skb, IFLA_PORT_VF, vf)) goto nla_put_failure; err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb); if (err == -EMSGSIZE) goto nla_put_failure; if (err) { nla_nest_cancel(skb, vf_port); continue; } nla_nest_end(skb, vf_port); } nla_nest_end(skb, vf_ports); return 0; nla_put_failure: nla_nest_cancel(skb, vf_ports); return -EMSGSIZE; } static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *port_self; int err; port_self = nla_nest_start_noflag(skb, IFLA_PORT_SELF); if (!port_self) return -EMSGSIZE; err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb); if (err) { nla_nest_cancel(skb, port_self); return (err == -EMSGSIZE) ? err : 0; } nla_nest_end(skb, port_self); return 0; } static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { int err; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; err = rtnl_port_self_fill(skb, dev); if (err) return err; if (dev_num_vf(dev->dev.parent)) { err = rtnl_vf_ports_fill(skb, dev); if (err) return err; } return 0; } static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; struct netdev_phys_item_id ppid; err = dev_get_phys_port_id(dev, &ppid); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) { char name[IFNAMSIZ]; int err; err = dev_get_phys_port_name(dev, name, sizeof(name)); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) return -EMSGSIZE; return 0; } static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { struct netdev_phys_item_id ppid = { }; int err; err = dev_get_port_parent_id(dev, &ppid, false); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_stats64 *sp; struct nlattr *attr; attr = nla_reserve_64bit(skb, IFLA_STATS64, sizeof(struct rtnl_link_stats64), IFLA_PAD); if (!attr) return -EMSGSIZE; sp = nla_data(attr); dev_get_stats(dev, sp); attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (!attr) return -EMSGSIZE; copy_rtnl_link_stats(nla_data(attr), sp); return 0; } static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; struct ifla_vf_mac vf_mac; struct ifla_vf_broadcast vf_broadcast; struct ifla_vf_info ivi; struct ifla_vf_guid node_guid; struct ifla_vf_guid port_guid; memset(&ivi, 0, sizeof(ivi)); /* Not all SR-IOV capable drivers support the * spoofcheck and "RSS query enable" query. Preset to * -1 so the user space tool can detect that the driver * didn't report anything. */ ivi.spoofchk = -1; ivi.rss_query_en = -1; ivi.trusted = -1; /* The default value for VF link state is "auto" * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; /* VLAN Protocol by default is 802.1Q */ ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; memset(&vf_vlan_info, 0, sizeof(vf_vlan_info)); memset(&node_guid, 0, sizeof(node_guid)); memset(&port_guid, 0, sizeof(port_guid)); vf_mac.vf = vf_vlan.vf = vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = vf_linkstate.vf = vf_rss_query_en.vf = vf_trust.vf = node_guid.vf = port_guid.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_vlan_info.vlan = ivi.vlan; vf_vlan_info.qos = ivi.qos; vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; vf_spoofchk.setting = ivi.spoofchk; vf_linkstate.link_state = ivi.linkstate; vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) return -EMSGSIZE; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), &vf_rate) || nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate) || nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), &vf_spoofchk) || nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), &vf_linkstate) || nla_put(skb, IFLA_VF_RSS_QUERY_EN, sizeof(vf_rss_query_en), &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) goto nla_put_vf_failure; if (dev->netdev_ops->ndo_get_vf_guid && !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid, &port_guid)) { if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid), &node_guid) || nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid), &port_guid)) goto nla_put_vf_failure; } vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST); if (!vfvlanlist) goto nla_put_vf_failure; if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), &vf_vlan_info)) { nla_nest_cancel(skb, vfvlanlist); goto nla_put_vf_failure; } nla_nest_end(skb, vfvlanlist); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); if (!vfstats) goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, vf_stats.tx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, vf_stats.multicast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { nla_nest_cancel(skb, vfstats); goto nla_put_vf_failure; } nla_nest_end(skb, vfstats); } nla_nest_end(skb, vf); return 0; nla_put_vf_failure: nla_nest_cancel(skb, vf); return -EMSGSIZE; } static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { struct nlattr *vfinfo; int i, num_vfs; if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0)) return 0; num_vfs = dev_num_vf(dev->dev.parent); if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs)) return -EMSGSIZE; if (!dev->netdev_ops->ndo_get_vf_config) return 0; vfinfo = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST); if (!vfinfo) return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) { nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } } nla_nest_end(skb, vfinfo); return 0; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_ifmap map; memset(&map, 0, sizeof(map)); map.mem_start = dev->mem_start; map.mem_end = dev->mem_end; map.base_addr = dev->base_addr; map.irq = dev->irq; map.dma = dev->dma; map.port = dev->if_port; if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) return -EMSGSIZE; return 0; } static u32 rtnl_xdp_prog_skb(struct net_device *dev) { const struct bpf_prog *generic_xdp_prog; ASSERT_RTNL(); generic_xdp_prog = rtnl_dereference(dev->xdp_prog); if (!generic_xdp_prog) return 0; return generic_xdp_prog->aux->id; } static u32 rtnl_xdp_prog_drv(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_DRV); } static u32 rtnl_xdp_prog_hw(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_HW); } static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, u32 (*get_prog_id)(struct net_device *dev)) { u32 curr_id; int err; curr_id = get_prog_id(dev); if (!curr_id) return 0; *prog_id = curr_id; err = nla_put_u32(skb, attr, curr_id); if (err) return err; if (*mode != XDP_ATTACHED_NONE) *mode = XDP_ATTACHED_MULTI; else *mode = tgt_mode; return 0; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *xdp; u32 prog_id; int err; u8 mode; xdp = nla_nest_start_noflag(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; prog_id = 0; mode = XDP_ATTACHED_NONE; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw); if (err) goto err_cancel; err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; if (prog_id && mode != XDP_ATTACHED_MULTI) { err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; } nla_nest_end(skb, xdp); return 0; err_cancel: nla_nest_cancel(skb, xdp); return err; } static u32 rtnl_get_event(unsigned long event) { u32 rtnl_event_type = IFLA_EVENT_NONE; switch (event) { case NETDEV_REBOOT: rtnl_event_type = IFLA_EVENT_REBOOT; break; case NETDEV_FEAT_CHANGE: rtnl_event_type = IFLA_EVENT_FEATURES; break; case NETDEV_BONDING_FAILOVER: rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; break; case NETDEV_NOTIFY_PEERS: rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; break; case NETDEV_RESEND_IGMP: rtnl_event_type = IFLA_EVENT_IGMP_RESEND; break; case NETDEV_CHANGEINFODATA: rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; break; default: break; } return rtnl_event_type; } static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) { const struct net_device *upper_dev; int ret = 0; rcu_read_lock(); upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); rcu_read_unlock(); return ret; } static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, bool force) { int ifindex = dev_get_iflink(dev); if (force || dev->ifindex != ifindex) return nla_put_u32(skb, IFLA_LINK, ifindex); return 0; } static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, struct net_device *dev) { char buf[IFALIASZ]; int ret; ret = dev_get_alias(dev, buf, sizeof(buf)); return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; } static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev, struct net *src_net, gfp_t gfp) { bool put_iflink = false; if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { int id = peernet2id_alloc(src_net, link_net, gfp); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) return -EMSGSIZE; put_iflink = true; } } return nla_put_iflink(skb, dev, put_iflink); } static int rtnl_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { const struct rtnl_af_ops *af_ops; struct nlattr *af_spec; af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af_spec) return -EMSGSIZE; list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { struct nlattr *af; int err; if (!af_ops->fill_link_af) continue; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) return -EMSGSIZE; err = af_ops->fill_link_af(skb, dev, ext_filter_mask); /* * Caller may return ENODATA to indicate that there * was no data to be dumped. This is not an error, it * means we should trim the attribute header and * continue. */ if (err == -ENODATA) nla_nest_cancel(skb, af); else if (err < 0) return -EMSGSIZE; nla_nest_end(skb, af); } nla_nest_end(skb, af_spec); return 0; } static int rtnl_fill_alt_ifnames(struct sk_buff *skb, const struct net_device *dev) { struct netdev_name_node *name_node; int count = 0; list_for_each_entry(name_node, &dev->name_node->list, list) { if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) return -EMSGSIZE; count++; } return count; } static int rtnl_fill_prop_list(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *prop_list; int ret; prop_list = nla_nest_start(skb, IFLA_PROP_LIST); if (!prop_list) return -EMSGSIZE; ret = rtnl_fill_alt_ifnames(skb, dev); if (ret <= 0) goto nest_cancel; nla_nest_end(skb, prop_list); return 0; nest_cancel: nla_nest_cancel(skb, prop_list); return ret; } static int rtnl_fill_proto_down(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *pr; u32 preason; if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; preason = dev->proto_down_reason; if (!preason) return 0; pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); if (!pr) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { nla_nest_cancel(skb, pr); goto nla_put_failure; } nla_nest_end(skb, pr); return 0; nla_put_failure: return -EMSGSIZE; } static int rtnl_fill_devlink_port(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *devlink_port_nest; int ret; devlink_port_nest = nla_nest_start(skb, IFLA_DEVLINK_PORT); if (!devlink_port_nest) return -EMSGSIZE; if (dev->devlink_port) { ret = devlink_nl_port_handle_fill(skb, dev->devlink_port); if (ret < 0) goto nest_cancel; } nla_nest_end(skb, devlink_port_nest); return 0; nest_cancel: nla_nest_cancel(skb, devlink_port_nest); return ret; } static int rtnl_fill_dpll_pin(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *dpll_pin_nest; int ret; dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN); if (!dpll_pin_nest) return -EMSGSIZE; ret = dpll_msg_add_pin_handle(skb, netdev_dpll_pin(dev)); if (ret < 0) goto nest_cancel; nla_nest_end(skb, dpll_pin_nest); return 0; nest_cancel: nla_nest_cancel(skb, dpll_pin_nest); return ret; } static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, u32 event, int *new_nsid, int new_ifindex, int tgt_netnsid, gfp_t gfp) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifi_family = AF_UNSPEC; ifm->__ifi_pad = 0; ifm->ifi_type = dev->type; ifm->ifi_index = dev->ifindex; ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; qdisc = rtnl_dereference(dev->qdisc); if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || nla_put_u8(skb, IFLA_OPERSTATE, netif_running(dev) ? dev->operstate : IF_OPER_DOWN) || nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) || nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, dev->gso_ipv4_max_size) || nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, dev->gro_ipv4_max_size) || nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + atomic_read(&dev->carrier_down_count)) || nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, atomic_read(&dev->carrier_down_count))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) goto nla_put_failure; if (event != IFLA_EVENT_NONE) { if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; } if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure; if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast)) goto nla_put_failure; } if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_port_name_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; if (rtnl_fill_vf(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_xdp_fill(skb, dev)) goto nla_put_failure; if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; } if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp)) goto nla_put_failure; if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; if (new_ifindex && nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) goto nla_put_failure; if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) && nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr)) goto nla_put_failure; rcu_read_lock(); if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure_rcu; rcu_read_unlock(); if (rtnl_fill_prop_list(skb, dev)) goto nla_put_failure; if (dev->dev.parent && nla_put_string(skb, IFLA_PARENT_DEV_NAME, dev_name(dev->dev.parent))) goto nla_put_failure; if (dev->dev.parent && dev->dev.parent->bus && nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME, dev->dev.parent->bus->name)) goto nla_put_failure; if (rtnl_fill_devlink_port(skb, dev)) goto nla_put_failure; if (rtnl_fill_dpll_pin(skb, dev)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure_rcu: rcu_read_unlock(); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, [IFLA_MTU] = { .type = NLA_U32 }, [IFLA_LINK] = { .type = NLA_U32 }, [IFLA_MASTER] = { .type = NLA_U32 }, [IFLA_CARRIER] = { .type = NLA_U8 }, [IFLA_TXQLEN] = { .type = NLA_U32 }, [IFLA_WEIGHT] = { .type = NLA_U32 }, [IFLA_OPERSTATE] = { .type = NLA_U8 }, [IFLA_LINKMODE] = { .type = NLA_U8 }, [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_NET_NS_FD] = { .type = NLA_U32 }, /* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to * allow 0-length string (needed to remove an alias). */ [IFLA_IFALIAS] = { .type = NLA_BINARY, .len = IFALIASZ - 1 }, [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, [IFLA_VF_PORTS] = { .type = NLA_NESTED }, [IFLA_PORT_SELF] = { .type = NLA_NESTED }, [IFLA_AF_SPEC] = { .type = NLA_NESTED }, [IFLA_EXT_MASK] = { .type = NLA_U32 }, [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, [IFLA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, [IFLA_MIN_MTU] = { .type = NLA_U32 }, [IFLA_MAX_MTU] = { .type = NLA_U32 }, [IFLA_PROP_LIST] = { .type = NLA_NESTED }, [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, [IFLA_ALLMULTI] = { .type = NLA_REJECT }, [IFLA_GSO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_KIND] = { .type = NLA_STRING }, [IFLA_INFO_DATA] = { .type = NLA_NESTED }, [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING }, [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_BROADCAST] = { .type = NLA_REJECT }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, [IFLA_VF_STATS] = { .type = NLA_NESTED }, [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) }, [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_VF] = { .type = NLA_U32 }, [IFLA_PORT_PROFILE] = { .type = NLA_STRING, .len = PORT_PROFILE_MAX }, [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, .len = PORT_UUID_MAX }, [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, .len = PORT_UUID_MAX }, [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, /* Unused, but we need to keep it here since user space could * fill it. It's also broken with regard to NLA_BINARY use in * combination with structs. */ [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, .len = sizeof(struct ifla_port_vsi) }, }; static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD }, [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) { const struct rtnl_link_ops *ops = NULL; struct nlattr *linfo[IFLA_INFO_MAX + 1]; if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; if (linfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } return ops; } static bool link_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = netdev_master_upper_dev_get(dev); /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need * another invalid value for ifindex to denote "no master". */ if (master_idx == -1) return !!master; if (!master || master->ifindex != master_idx) return true; return false; } static bool link_kind_filtered(const struct net_device *dev, const struct rtnl_link_ops *kind_ops) { if (kind_ops && dev->rtnl_link_ops != kind_ops) return true; return false; } static bool link_dump_filtered(struct net_device *dev, int master_idx, const struct rtnl_link_ops *kind_ops) { if (link_master_filtered(dev, master_idx) || link_kind_filtered(dev, kind_ops)) return true; return false; } /** * rtnl_get_net_ns_capable - Get netns if sufficiently privileged. * @sk: netlink socket * @netnsid: network namespace identifier * * Returns the network namespace identified by netnsid on success or an error * pointer on failure. */ struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid) { struct net *net; net = get_net_ns_by_id(sock_net(sk), netnsid); if (!net) return ERR_PTR(-EINVAL); /* For now, the caller is required to have CAP_NET_ADMIN in * the user namespace owning the target net ns. */ if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) { put_net(net); return ERR_PTR(-EACCES); } return net; } EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable); static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, bool strict_check, struct nlattr **tb, struct netlink_ext_ack *extack) { int hdrlen; if (strict_check) { struct ifinfomsg *ifm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid header for link dump"); return -EINVAL; } ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); return -EINVAL; } if (ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps"); return -EINVAL; } return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); } /* A hack to preserve kernel<->userspace interface. * The correct header is ifinfomsg. It is consistent with rtnl_getlink. * However, before Linux v3.9 the code here assumed rtgenmsg and that's * what iproute2 < v3.9.0 used. * We can detect the old iproute2. Even including the IFLA_EXT_MASK * attribute, its netlink message is shorter than struct ifinfomsg. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, extack); } static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; int h, s_h; int idx = 0, s_idx; struct net_device *dev; struct hlist_head *head; struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; const struct rtnl_link_ops *kind_ops = NULL; unsigned int flags = NLM_F_MULTI; int master_idx = 0; int netnsid = -1; int err, i; s_h = cb->args[0]; s_idx = cb->args[1]; err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); if (err < 0) { if (cb->strict_check) return err; goto walk_entries; } for (i = 0; i <= IFLA_MAX; ++i) { if (!tb[i]) continue; /* new attributes should only be added with strict checking */ switch (i) { case IFLA_TARGET_NETNSID: netnsid = nla_get_s32(tb[i]); tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); if (IS_ERR(tgt_net)) { NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); return PTR_ERR(tgt_net); } break; case IFLA_EXT_MASK: ext_filter_mask = nla_get_u32(tb[i]); break; case IFLA_MASTER: master_idx = nla_get_u32(tb[i]); break; case IFLA_LINKINFO: kind_ops = linkinfo_to_kind_ops(tb[i]); break; default: if (cb->strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); return -EINVAL; } } } if (master_idx || kind_ops) flags |= NLM_F_DUMP_FILTERED; walk_entries: for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &tgt_net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { if (link_dump_filtered(dev, master_idx, kind_ops)) goto cont; if (idx < s_idx) goto cont; err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, 0, flags, ext_filter_mask, 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) { if (likely(skb->len)) goto out; goto out_err; } cont: idx++; } } out: err = skb->len; out_err: cb->args[1] = idx; cb->args[0] = h; cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); if (netnsid >= 0) put_net(tgt_net); return err; } int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, struct netlink_ext_ack *exterr) { const struct ifinfomsg *ifmp; const struct nlattr *attrs; size_t len; ifmp = nla_data(nla_peer); attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg); len = nla_len(nla_peer) - sizeof(struct ifinfomsg); if (ifmp->ifi_index < 0) { NL_SET_ERR_MSG_ATTR(exterr, nla_peer, "ifindex can't be negative"); return -EINVAL; } return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy, exterr); } EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { struct net *net; /* Examine the link attributes and figure out which * network namespace we are talking about. */ if (tb[IFLA_NET_NS_PID]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); else net = get_net(src_net); return net; } EXPORT_SYMBOL(rtnl_link_get_net); /* Figure out which network namespace we are talking about by * examining the link attributes in the following order: * * 1. IFLA_NET_NS_PID * 2. IFLA_NET_NS_FD * 3. IFLA_TARGET_NETNSID */ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, struct nlattr *tb[]) { struct net *net; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) return rtnl_link_get_net(src_net, tb); if (!tb[IFLA_TARGET_NETNSID]) return get_net(src_net); net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID])); if (!net) return ERR_PTR(-EINVAL); return net; } static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, struct net *src_net, struct nlattr *tb[], int cap) { struct net *net; net = rtnl_link_get_net_by_nlattr(src_net, tb); if (IS_ERR(net)) return net; if (!netlink_ns_capable(skb, net->user_ns, cap)) { put_net(net); return ERR_PTR(-EPERM); } return net; } /* Verify that rtnetlink requests do not pass additional properties * potentially referring to different network namespaces. */ static int rtnl_ensure_unique_netns(struct nlattr *tb[], struct netlink_ext_ack *extack, bool netns_id_only) { if (netns_id_only) { if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD]) return 0; NL_SET_ERR_MSG(extack, "specified netns attribute not supported"); return -EOPNOTSUPP; } if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID])) goto invalid_attr; return 0; invalid_attr: NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified"); return -EINVAL; } static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_set_vf_rate) return -EOPNOTSUPP; if (max_tx_rate && max_tx_rate < min_tx_rate) return -EINVAL; return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate); } static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS] && nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) return -EINVAL; if (tb[IFLA_BROADCAST] && nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) return -EINVAL; if (tb[IFLA_GSO_MAX_SIZE] && nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) { NL_SET_ERR_MSG(extack, "too big gso_max_size"); return -EINVAL; } if (tb[IFLA_GSO_MAX_SEGS] && (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS || nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) { NL_SET_ERR_MSG(extack, "too big gso_max_segs"); return -EINVAL; } if (tb[IFLA_GRO_MAX_SIZE] && nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) { NL_SET_ERR_MSG(extack, "too big gro_max_size"); return -EINVAL; } if (tb[IFLA_GSO_IPV4_MAX_SIZE] && nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); return -EINVAL; } if (tb[IFLA_GRO_IPV4_MAX_SIZE] && nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) { NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size"); return -EINVAL; } if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; af_ops = rtnl_af_lookup(nla_type(af)); if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) return -EOPNOTSUPP; if (af_ops->validate_link_af) { err = af_ops->validate_link_af(dev, af, extack); if (err < 0) return err; } } } return 0; } static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { const struct net_device_ops *ops = dev->netdev_ops; return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type); } static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { if (dev->type != ARPHRD_INFINIBAND) return -EOPNOTSUPP; return handle_infiniband_guid(dev, ivt, guid_type); } static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) { const struct net_device_ops *ops = dev->netdev_ops; int err = -EINVAL; if (tb[IFLA_VF_MAC]) { struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]); if (ivm->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_mac) err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac); if (err < 0) return err; } if (tb[IFLA_VF_VLAN]) { struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]); if (ivv->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, ivv->qos, htons(ETH_P_8021Q)); if (err < 0) return err; } if (tb[IFLA_VF_VLAN_LIST]) { struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; struct nlattr *attr; int rem, len = 0; err = -EOPNOTSUPP; if (!ops->ndo_set_vf_vlan) return err; nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { if (nla_type(attr) != IFLA_VF_VLAN_INFO || nla_len(attr) < NLA_HDRLEN) { return -EINVAL; } if (len >= MAX_VLAN_LIST_LEN) return -EOPNOTSUPP; ivvl[len] = nla_data(attr); len++; } if (len == 0) return -EINVAL; if (ivvl[0]->vf >= INT_MAX) return -EINVAL; err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } if (tb[IFLA_VF_TX_RATE]) { struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]); struct ifla_vf_info ivf; if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_get_vf_config) err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf); if (err < 0) return err; err = rtnl_set_vf_rate(dev, ivt->vf, ivf.min_tx_rate, ivt->rate); if (err < 0) return err; } if (tb[IFLA_VF_RATE]) { struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]); if (ivt->vf >= INT_MAX) return -EINVAL; err = rtnl_set_vf_rate(dev, ivt->vf, ivt->min_tx_rate, ivt->max_tx_rate); if (err < 0) return err; } if (tb[IFLA_VF_SPOOFCHK]) { struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]); if (ivs->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_spoofchk) err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, ivs->setting); if (err < 0) return err; } if (tb[IFLA_VF_LINK_STATE]) { struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]); if (ivl->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_link_state) err = ops->ndo_set_vf_link_state(dev, ivl->vf, ivl->link_state); if (err < 0) return err; } if (tb[IFLA_VF_RSS_QUERY_EN]) { struct ifla_vf_rss_query_en *ivrssq_en; err = -EOPNOTSUPP; ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]); if (ivrssq_en->vf >= INT_MAX) return -EINVAL; if (ops->ndo_set_vf_rss_query_en) err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf, ivrssq_en->setting); if (err < 0) return err; } if (tb[IFLA_VF_TRUST]) { struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]); if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_trust) err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting); if (err < 0) return err; } if (tb[IFLA_VF_IB_NODE_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); if (ivt->vf >= INT_MAX) return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); } if (tb[IFLA_VF_IB_PORT_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); if (ivt->vf >= INT_MAX) return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID); } return err; } static int do_set_master(struct net_device *dev, int ifindex, struct netlink_ext_ack *extack) { struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; int err; if (upper_dev) { if (upper_dev->ifindex == ifindex) return 0; ops = upper_dev->netdev_ops; if (ops->ndo_del_slave) { err = ops->ndo_del_slave(upper_dev, dev); if (err) return err; } else { return -EOPNOTSUPP; } } if (ifindex) { upper_dev = __dev_get_by_index(dev_net(dev), ifindex); if (!upper_dev) return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { err = ops->ndo_add_slave(upper_dev, dev, extack); if (err) return err; } else { return -EOPNOTSUPP; } } return 0; } static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, }; static int do_set_proto_down(struct net_device *dev, struct nlattr *nl_proto_down, struct nlattr *nl_proto_down_reason, struct netlink_ext_ack *extack) { struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; unsigned long mask = 0; u32 value; bool proto_down; int err; if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) { NL_SET_ERR_MSG(extack, "Protodown not supported by device"); return -EOPNOTSUPP; } if (nl_proto_down_reason) { err = nla_parse_nested_deprecated(pdreason, IFLA_PROTO_DOWN_REASON_MAX, nl_proto_down_reason, ifla_proto_down_reason_policy, NULL); if (err < 0) return err; if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) { NL_SET_ERR_MSG(extack, "Invalid protodown reason value"); return -EINVAL; } value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]); if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); dev_change_proto_down_reason(dev, mask, value); } if (nl_proto_down) { proto_down = nla_get_u8(nl_proto_down); /* Don't turn off protodown if there are active reasons */ if (!proto_down && dev->proto_down_reason) { NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); return -EBUSY; } err = dev_change_proto_down(dev, proto_down); if (err) return err; } return 0; } #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb, int status) { const struct net_device_ops *ops = dev->netdev_ops; char ifname[IFNAMSIZ]; int err; if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { const char *pat = ifname[0] ? ifname : NULL; struct net *net; int new_ifindex; net = rtnl_link_get_net_capable(skb, dev_net(dev), tb, CAP_NET_ADMIN); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; } if (tb[IFLA_NEW_IFINDEX]) new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); else new_ifindex = 0; err = __dev_change_net_namespace(dev, net, pat, new_ifindex); put_net(net); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MAP]) { struct rtnl_link_ifmap *u_map; struct ifmap k_map; if (!ops->ndo_set_config) { err = -EOPNOTSUPP; goto errout; } if (!netif_device_present(dev)) { err = -ENODEV; goto errout; } u_map = nla_data(tb[IFLA_MAP]); k_map.mem_start = (unsigned long) u_map->mem_start; k_map.mem_end = (unsigned long) u_map->mem_end; k_map.base_addr = (unsigned short) u_map->base_addr; k_map.irq = (unsigned char) u_map->irq; k_map.dma = (unsigned char) u_map->dma; k_map.port = (unsigned char) u_map->port; err = ops->ndo_set_config(dev, &k_map); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_ADDRESS]) { struct sockaddr *sa; int len; len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, sizeof(*sa)); sa = kmalloc(len, GFP_KERNEL); if (!sa) { err = -ENOMEM; goto errout; } sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); err = dev_set_mac_address_user(dev, sa, extack); kfree(sa); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MTU]) { err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); status |= DO_SETLINK_NOTIFY; } /* * Interface selected by interface index but interface * name provided implies that a name change has been * requested. */ if (ifm->ifi_index > 0 && ifname[0]) { err = dev_change_name(dev, ifname); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_BROADCAST]) { nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (ifm->ifi_flags || ifm->ifi_change) { err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), extack); if (err < 0) goto errout; } if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_TXQLEN]) { unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); err = dev_change_tx_queue_len(dev, value); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); if (dev->gso_max_size ^ max_size) { netif_set_gso_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); if (dev->gso_max_segs ^ max_segs) { netif_set_gso_max_segs(dev, max_segs); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GRO_MAX_SIZE]) { u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); if (dev->gro_max_size ^ gro_max_size) { netif_set_gro_max_size(dev, gro_max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GSO_IPV4_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]); if (dev->gso_ipv4_max_size ^ max_size) { netif_set_gso_ipv4_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GRO_IPV4_MAX_SIZE]) { u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]); if (dev->gro_ipv4_max_size ^ gro_max_size) { netif_set_gro_ipv4_max_size(dev, gro_max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); write_lock(&dev_base_lock); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; dev->link_mode = value; write_unlock(&dev_base_lock); } if (tb[IFLA_VFINFO_LIST]) { struct nlattr *vfinfo[IFLA_VF_MAX + 1]; struct nlattr *attr; int rem; nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { if (nla_type(attr) != IFLA_VF_INFO || nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } err = nla_parse_nested_deprecated(vfinfo, IFLA_VF_MAX, attr, ifla_vf_policy, NULL); if (err < 0) goto errout; err = do_setvfinfo(dev, vfinfo); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_VF_PORTS]) { struct nlattr *port[IFLA_PORT_MAX+1]; struct nlattr *attr; int vf; int rem; err = -EOPNOTSUPP; if (!ops->ndo_set_vf_port) goto errout; nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { if (nla_type(attr) != IFLA_VF_PORT || nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, attr, ifla_port_policy, NULL); if (err < 0) goto errout; if (!port[IFLA_PORT_VF]) { err = -EOPNOTSUPP; goto errout; } vf = nla_get_u32(port[IFLA_PORT_VF]); err = ops->ndo_set_vf_port(dev, vf, port); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_PORT_SELF]) { struct nlattr *port[IFLA_PORT_MAX+1]; err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, tb[IFLA_PORT_SELF], ifla_port_policy, NULL); if (err < 0) goto errout; err = -EOPNOTSUPP; if (ops->ndo_set_vf_port) err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); err = af_ops->set_link_af(dev, af, extack); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], tb[IFLA_PROTO_DOWN_REASON], extack); if (err) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_XDP]) { struct nlattr *xdp[IFLA_XDP_MAX + 1]; u32 xdp_flags = 0; err = nla_parse_nested_deprecated(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], ifla_xdp_policy, NULL); if (err < 0) goto errout; if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) { err = -EINVAL; goto errout; } if (xdp[IFLA_XDP_FLAGS]) { xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]); if (xdp_flags & ~XDP_FLAGS_MASK) { err = -EINVAL; goto errout; } if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) { err = -EINVAL; goto errout; } } if (xdp[IFLA_XDP_FD]) { int expected_fd = -1; if (xdp_flags & XDP_FLAGS_REPLACE) { if (!xdp[IFLA_XDP_EXPECTED_FD]) { err = -EINVAL; goto errout; } expected_fd = nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]); } err = dev_change_xdp_fd(dev, extack, nla_get_s32(xdp[IFLA_XDP_FD]), expected_fd, xdp_flags); if (err) goto errout; status |= DO_SETLINK_NOTIFY; } } errout: if (status & DO_SETLINK_MODIFIED) { if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY) netdev_state_change(dev); if (err < 0) net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", dev->name); } return err; } static struct net_device *rtnl_dev_get(struct net *net, struct nlattr *tb[]) { char ifname[ALTIFNAMSIZ]; if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else if (tb[IFLA_ALT_IFNAME]) nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ); else return NULL; return __dev_get_by_name(net, ifname); } static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; int err; struct nlattr *tb[IFLA_MAX+1]; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) goto errout; err = rtnl_ensure_unique_netns(tb, extack, false); if (err < 0) goto errout; err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else goto errout; if (dev == NULL) { err = -ENODEV; goto errout; } err = validate_linkmsg(dev, tb, extack); if (err < 0) goto errout; err = do_setlink(skb, dev, ifm, extack, tb, 0); errout: return err; } static int rtnl_group_dellink(const struct net *net, int group) { struct net_device *dev, *aux; LIST_HEAD(list_kill); bool found = false; if (!group) return -EPERM; for_each_netdev(net, dev) { if (dev->group == group) { const struct rtnl_link_ops *ops; found = true; ops = dev->rtnl_link_ops; if (!ops || !ops->dellink) return -EOPNOTSUPP; } } if (!found) return -ENODEV; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { const struct rtnl_link_ops *ops; ops = dev->rtnl_link_ops; ops->dellink(dev, &list_kill); } } unregister_netdevice_many(&list_kill); return 0; } int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh) { const struct rtnl_link_ops *ops; LIST_HEAD(list_kill); ops = dev->rtnl_link_ops; if (!ops || !ops->dellink) return -EOPNOTSUPP; ops->dellink(dev, &list_kill); unregister_netdevice_many_notify(&list_kill, portid, nlh); return 0; } EXPORT_SYMBOL_GPL(rtnl_delete_link); static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; struct net *tgt_net = net; struct net_device *dev = NULL; struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; int err; int netnsid = -1; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err < 0) return err; if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else goto out; if (!dev) { if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME] || ifm->ifi_index > 0) err = -ENODEV; goto out; } err = rtnl_delete_link(dev, portid, nlh); out: if (netnsid >= 0) put_net(tgt_net); return err; } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh) { unsigned int old_flags; int err; old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), NULL); if (err < 0) return err; } if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh); } else { dev->rtnl_link_state = RTNL_LINK_INITIALIZED; __dev_notify_flags(dev, old_flags, ~0U, portid, nlh); } return 0; } EXPORT_SYMBOL(rtnl_configure_link); struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_device *dev; unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; int err; if (tb[IFLA_NUM_TX_QUEUES]) num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); else if (ops->get_num_tx_queues) num_tx_queues = ops->get_num_tx_queues(); if (tb[IFLA_NUM_RX_QUEUES]) num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]); else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); if (num_tx_queues < 1 || num_tx_queues > 4096) { NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); return ERR_PTR(-EINVAL); } if (num_rx_queues < 1 || num_rx_queues > 4096) { NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); return ERR_PTR(-EINVAL); } if (ops->alloc) { dev = ops->alloc(tb, ifname, name_assign_type, num_tx_queues, num_rx_queues); if (IS_ERR(dev)) return dev; } else { dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); } if (!dev) return ERR_PTR(-ENOMEM); err = validate_linkmsg(dev, tb, extack); if (err < 0) { free_netdev(dev); return ERR_PTR(err); } dev_net_set(dev, net); dev->rtnl_link_ops = ops; dev->rtnl_link_state = RTNL_LINK_INITIALIZING; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); err = dev_validate_mtu(dev, mtu, extack); if (err) { free_netdev(dev); return ERR_PTR(err); } dev->mtu = mtu; } if (tb[IFLA_ADDRESS]) { __dev_addr_set(dev, nla_data(tb[IFLA_ADDRESS]), nla_len(tb[IFLA_ADDRESS])); dev->addr_assign_type = NET_ADDR_SET; } if (tb[IFLA_BROADCAST]) memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), nla_len(tb[IFLA_BROADCAST])); if (tb[IFLA_TXQLEN]) dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); if (tb[IFLA_GRO_MAX_SIZE]) netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); if (tb[IFLA_GSO_IPV4_MAX_SIZE]) netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE])); if (tb[IFLA_GRO_IPV4_MAX_SIZE]) netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE])); return dev; } EXPORT_SYMBOL(rtnl_create_link); static int rtnl_group_changelink(const struct sk_buff *skb, struct net *net, int group, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb) { struct net_device *dev, *aux; int err; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { err = validate_linkmsg(dev, tb, extack); if (err < 0) return err; err = do_setlink(skb, dev, ifm, extack, tb, 0); if (err < 0) return err; } } return 0; } static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) { unsigned char name_assign_type = NET_NAME_USER; struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; struct net *dest_net, *link_net; struct net_device *dev; char ifname[IFNAMSIZ]; int err; if (!ops->alloc && !ops->setup) return -EOPNOTSUPP; if (tb[IFLA_IFNAME]) { nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); } else { snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); name_assign_type = NET_NAME_ENUM; } dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); if (IS_ERR(dest_net)) return PTR_ERR(dest_net); if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); link_net = get_net_ns_by_id(dest_net, id); if (!link_net) { NL_SET_ERR_MSG(extack, "Unknown network namespace id"); err = -EINVAL; goto out; } err = -EPERM; if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) goto out; } else { link_net = NULL; } dev = rtnl_create_link(link_net ? : dest_net, ifname, name_assign_type, ops, tb, extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; } dev->ifindex = ifm->ifi_index; if (ops->newlink) err = ops->newlink(link_net ? : net, dev, tb, data, extack); else err = register_netdevice(dev); if (err < 0) { free_netdev(dev); goto out; } err = rtnl_configure_link(dev, ifm, portid, nlh); if (err < 0) goto out_unregister; if (link_net) { err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) goto out_unregister; } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto out_unregister; } out: if (link_net) put_net(link_net); put_net(dest_net); return err; out_unregister: if (ops->newlink) { LIST_HEAD(list_kill); ops->dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); } else { unregister_netdevice(dev); } goto out; } struct rtnl_newlink_tbs { struct nlattr *tb[IFLA_MAX + 1]; struct nlattr *attr[RTNL_MAX_TYPE + 1]; struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; }; static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct rtnl_newlink_tbs *tbs, struct netlink_ext_ack *extack) { struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; struct nlattr ** const tb = tbs->tb; const struct rtnl_link_ops *m_ops; struct net_device *master_dev; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; struct nlattr **slave_data; char kind[MODULE_NAME_LEN]; struct net_device *dev; struct ifinfomsg *ifm; struct nlattr **data; bool link_specified; int err; #ifdef CONFIG_MODULES replay: #endif err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, false); if (err < 0) return err; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { link_specified = true; dev = __dev_get_by_index(net, ifm->ifi_index); } else if (ifm->ifi_index < 0) { NL_SET_ERR_MSG(extack, "ifindex can't be negative"); return -EINVAL; } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { link_specified = true; dev = rtnl_dev_get(net, tb); } else { link_specified = false; dev = NULL; } master_dev = NULL; m_ops = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); if (master_dev) m_ops = master_dev->rtnl_link_ops; } if (tb[IFLA_LINKINFO]) { err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], ifla_info_policy, NULL); if (err < 0) return err; } else memset(linkinfo, 0, sizeof(linkinfo)); if (linkinfo[IFLA_INFO_KIND]) { nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } else { kind[0] = '\0'; ops = NULL; } data = NULL; if (ops) { if (ops->maxtype > RTNL_MAX_TYPE) return -EINVAL; if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { err = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], ops->policy, extack); if (err < 0) return err; data = tbs->attr; } if (ops->validate) { err = ops->validate(tb, data, extack); if (err < 0) return err; } } slave_data = NULL; if (m_ops) { if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) return -EINVAL; if (m_ops->slave_maxtype && linkinfo[IFLA_INFO_SLAVE_DATA]) { err = nla_parse_nested_deprecated(tbs->slave_attr, m_ops->slave_maxtype, linkinfo[IFLA_INFO_SLAVE_DATA], m_ops->slave_policy, extack); if (err < 0) return err; slave_data = tbs->slave_attr; } } if (dev) { int status = 0; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; err = validate_linkmsg(dev, tb, extack); if (err < 0) return err; if (linkinfo[IFLA_INFO_DATA]) { if (!ops || ops != dev->rtnl_link_ops || !ops->changelink) return -EOPNOTSUPP; err = ops->changelink(dev, tb, data, extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; } if (linkinfo[IFLA_INFO_SLAVE_DATA]) { if (!m_ops || !m_ops->slave_changelink) return -EOPNOTSUPP; err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; } return do_setlink(skb, dev, ifm, extack, tb, status); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, * or it's for a group */ if (link_specified) return -ENODEV; if (tb[IFLA_GROUP]) return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); return -ENODEV; } if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) return -EOPNOTSUPP; if (!ops) { #ifdef CONFIG_MODULES if (kind[0]) { __rtnl_unlock(); request_module("rtnl-link-%s", kind); rtnl_lock(); ops = rtnl_link_ops_get(kind); if (ops) goto replay; } #endif NL_SET_ERR_MSG(extack, "Unknown device type"); return -EOPNOTSUPP; } return rtnl_newlink_create(skb, ifm, ops, nlh, tb, data, extack); } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct rtnl_newlink_tbs *tbs; int ret; tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); if (!tbs) return -ENOMEM; ret = __rtnl_newlink(skb, nlh, tbs, extack); kfree(tbs); return ret; } static int rtnl_valid_getlink_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm; int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid header for get link"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err) return err; for (i = 0; i <= IFLA_MAX; i++) { if (!tb[i]) continue; switch (i) { case IFLA_IFNAME: case IFLA_ALT_IFNAME: case IFLA_EXT_MASK: case IFLA_TARGET_NETNSID: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request"); return -EINVAL; } } return 0; } static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; int netnsid = -1; int err; u32 ext_filter_mask = 0; err = rtnl_valid_getlink_req(skb, nlh, tb, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err < 0) return err; if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); else goto out; err = -ENODEV; if (dev == NULL) goto out; err = -ENOBUFS; nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask)); if (nskb == NULL) goto out; err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); out: if (netnsid >= 0) put_net(tgt_net); return err; } static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, bool *changed, struct netlink_ext_ack *extack) { char *alt_ifname; size_t size; int err; err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); if (err) return err; if (cmd == RTM_NEWLINKPROP) { size = rtnl_prop_list_size(dev); size += nla_total_size(ALTIFNAMSIZ); if (size >= U16_MAX) { NL_SET_ERR_MSG(extack, "effective property list too long"); return -EINVAL; } } alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (!alt_ifname) return -ENOMEM; if (cmd == RTM_NEWLINKPROP) { err = netdev_name_node_alt_create(dev, alt_ifname); if (!err) alt_ifname = NULL; } else if (cmd == RTM_DELLINKPROP) { err = netdev_name_node_alt_destroy(dev, alt_ifname); } else { WARN_ON_ONCE(1); err = -EINVAL; } kfree(alt_ifname); if (!err) *changed = true; return err; } static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; struct ifinfomsg *ifm; bool changed = false; struct nlattr *attr; int err, rem; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err) return err; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else return -EINVAL; if (!dev) return -ENODEV; if (!tb[IFLA_PROP_LIST]) return 0; nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) { switch (nla_type(attr)) { case IFLA_ALT_IFNAME: err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack); if (err) return err; break; } } if (changed) netdev_state_change(dev); return 0; } static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack); } static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); } static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); size_t min_ifinfo_dump_size = 0; struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; struct net_device *dev; int hdrlen; /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); if (nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); } if (!ext_filter_mask) return NLMSG_GOODSIZE; /* * traverse the list of net devices and compute the minimum * buffer size based upon the filter mask. */ rcu_read_lock(); for_each_netdev_rcu(net, dev) { min_ifinfo_dump_size = max(min_ifinfo_dump_size, if_nlmsg_size(dev, ext_filter_mask)); } rcu_read_unlock(); return nlmsg_total_size(min_ifinfo_dump_size); } static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) { int idx; int s_idx = cb->family; int type = cb->nlh->nlmsg_type - RTM_BASE; int ret = 0; if (s_idx == 0) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { struct rtnl_link __rcu **tab; struct rtnl_link *link; rtnl_dumpit_func dumpit; if (idx < s_idx || idx == PF_PACKET) continue; if (type < 0 || type >= RTM_NR_MSGTYPES) continue; tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]); if (!tab) continue; link = rcu_dereference_rtnl(tab[type]); if (!link) continue; dumpit = link->dumpit; if (!dumpit) continue; if (idx > s_idx) { memset(&cb->args[0], 0, sizeof(cb->args)); cb->prev_seq = 0; cb->seq = 0; } ret = dumpit(skb, cb); if (ret) break; } cb->family = idx; return skb->len ? : ret; } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex, u32 portid, const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; u32 seq = 0; skb = nlmsg_new(if_nlmsg_size(dev, 0), flags); if (skb == NULL) goto errout; if (nlmsg_report(nlh)) seq = nlmsg_seq(nlh); else portid = 0; err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, portid, seq, change, 0, 0, event, new_nsid, new_ifindex, -1, flags); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } return skb; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); return NULL; } void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags, u32 portid, const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); rtnl_notify(skb, net, portid, RTNLGRP_LINK, nlh, flags); } static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex, u32 portid, const struct nlmsghdr *nlh) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, new_ifindex, portid, nlh); if (skb) rtmsg_ifinfo_send(skb, dev, flags, portid, nlh); } void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags, u32 portid, const struct nlmsghdr *nlh) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL, 0, portid, nlh); } void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, new_nsid, new_ifindex, 0, NULL); } static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, int type, unsigned int flags, int nlflags, u16 ndm_state) { struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags); if (!nlh) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = flags; ndm->ndm_type = 0; ndm->ndm_ifindex = dev->ifindex; ndm->ndm_state = ndm_state; if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr)) goto nla_put_failure; if (vid) if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(dev->addr_len) + /* NDA_LLADDR */ nla_total_size(sizeof(u16)) + /* NDA_VLAN */ 0; } static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, u16 ndm_state) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_fdb_fill(skb, dev, addr, vid, 0, 0, type, NTF_SELF, 0, ndm_state); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } /* * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry */ int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags) { int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } if (tb[NDA_FLAGS_EXT]) { netdev_info(dev, "invalid flags given to default FDB implementation\n"); return err; } if (vid) { netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n"); return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_add_excl(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_add_excl(dev, addr); /* Only return duplicate errors if NLM_F_EXCL is set */ if (err == -EEXIST && !(flags & NLM_F_EXCL)) err = 0; return err; } EXPORT_SYMBOL(ndo_dflt_fdb_add); static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid, struct netlink_ext_ack *extack) { u16 vid = 0; if (vlan_attr) { if (nla_len(vlan_attr) != sizeof(u16)) { NL_SET_ERR_MSG(extack, "invalid vlan attribute size"); return -EINVAL; } vid = nla_get_u16(vlan_attr); if (!vid || vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "invalid vlan id"); return -EINVAL; } } *p_vid = vid; return 0; } static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; u8 *addr; u16 vid; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } if (dev->type != ARPHRD_ETHER) { NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, extack); if (err) goto out; else ndm->ndm_flags &= ~NTF_MASTER; } /* Embedded bridge, macvlan, and any other device support */ if ((ndm->ndm_flags & NTF_SELF)) { if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, extack); else err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); if (!err) { rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } out: return err; } /* * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry */ int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid) { int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (!(ndm->ndm_state & NUD_PERMANENT)) { netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); return err; } EXPORT_SYMBOL(ndo_dflt_fdb_del); static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); struct net *net = sock_net(skb->sk); const struct net_device_ops *ops; struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; __u8 *addr = NULL; int err; u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!del_bulk) { err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); } else { /* For bulk delete, the drivers will parse the message with * policy. */ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); } if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!del_bulk) { if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; } if (dev->type != ARPHRD_ETHER) { NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices"); return -EINVAL; } err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); ops = br_dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); } else { if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(nlh, dev, extack); } if (err) goto out; else ndm->ndm_flags &= ~NTF_MASTER; } /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { ops = dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); } else { /* in case err was cleared by NTF_MASTER call */ err = -EOPNOTSUPP; if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(nlh, dev, extack); } if (!err) { if (!del_bulk) rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } out: return err; } static int nlmsg_populate_fdb(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, int *idx, struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha; int err; u32 portid, seq; portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { if (*idx < cb->args[2]) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, portid, seq, RTM_NEWNEIGH, NTF_SELF, NLM_F_MULTI, NUD_PERMANENT); if (err < 0) return err; skip: *idx += 1; } return 0; } /** * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table. * @skb: socket buffer to store message in * @cb: netlink callback * @dev: netdevice * @filter_dev: ignored * @idx: the number of FDB table entries dumped is added to *@idx * * Default netdevice operation to dump the existing unicast address list. * Returns number of addresses from list put in skb. */ int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { int err; if (dev->type != ARPHRD_ETHER) return -EINVAL; netif_addr_lock_bh(dev); err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) goto out; err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); out: netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, int *br_idx, int *brport_idx, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); return -EINVAL; } ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, extack); if (err < 0) return err; *brport_idx = ndm->ndm_ifindex; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_IFINDEX: if (nla_len(tb[i]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request"); return -EINVAL; } *brport_idx = nla_get_u32(tb[NDA_IFINDEX]); break; case NDA_MASTER: if (nla_len(tb[i]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request"); return -EINVAL; } *br_idx = nla_get_u32(tb[NDA_MASTER]); break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request"); return -EINVAL; } } return 0; } static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, int *br_idx, int *brport_idx, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX+1]; int err; /* A hack to preserve kernel<->userspace interface. * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails. * So, check for ndmsg with an optional u32 attribute (not used here). * Fortunately these sizes don't conflict with the size of ifinfomsg * with an optional attribute. */ if (nlmsg_len(nlh) != sizeof(struct ndmsg) && (nlmsg_len(nlh) != sizeof(struct ndmsg) + nla_attr_size(sizeof(u32)))) { struct ifinfomsg *ifm; err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) { return -EINVAL; } else if (err == 0) { if (tb[IFLA_MASTER]) *br_idx = nla_get_u32(tb[IFLA_MASTER]); } ifm = nlmsg_data(nlh); *brport_idx = ifm->ifi_index; } return 0; } static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net_device *dev; struct net_device *br_dev = NULL; const struct net_device_ops *ops = NULL; const struct net_device_ops *cops = NULL; struct net *net = sock_net(skb->sk); struct hlist_head *head; int brport_idx = 0; int br_idx = 0; int h, s_h; int idx = 0, s_idx; int err = 0; int fidx = 0; if (cb->strict_check) err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, cb->extack); else err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx, cb->extack); if (err < 0) return err; if (br_idx) { br_dev = __dev_get_by_index(net, br_idx); if (!br_dev) return -ENODEV; ops = br_dev->netdev_ops; } s_h = cb->args[0]; s_idx = cb->args[1]; for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { if (brport_idx && (dev->ifindex != brport_idx)) continue; if (!br_idx) { /* user did not specify a specific bridge */ if (netif_is_bridge_port(dev)) { br_dev = netdev_master_upper_dev_get(dev); cops = br_dev->netdev_ops; } } else { if (dev != br_dev && !netif_is_bridge_port(dev)) continue; if (br_dev != netdev_master_upper_dev_get(dev) && !netif_is_bridge_master(dev)) continue; cops = ops; } if (idx < s_idx) goto cont; if (netif_is_bridge_port(dev)) { if (cops && cops->ndo_fdb_dump) { err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, &fidx); if (err == -EMSGSIZE) goto out; } } if (dev->netdev_ops->ndo_fdb_dump) err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, &fidx); else err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx); if (err == -EMSGSIZE) goto out; cops = NULL; /* reset fdb offset to 0 for rest of the interfaces */ cb->args[2] = 0; fidx = 0; cont: idx++; } } out: cb->args[0] = h; cb->args[1] = idx; cb->args[2] = fidx; return skb->len; } static int valid_fdb_get_strict(const struct nlmsghdr *nlh, struct nlattr **tb, u8 *ndm_flags, int *br_idx, int *brport_idx, u8 **addr, u16 *vid, struct netlink_ext_ack *extack) { struct ndmsg *ndm; int err, i; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); return -EINVAL; } ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); return -EINVAL; } if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) { NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) return err; *ndm_flags = ndm->ndm_flags; *brport_idx = ndm->ndm_ifindex; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_MASTER: *br_idx = nla_get_u32(tb[i]); break; case NDA_LLADDR: if (nla_len(tb[i]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Invalid address in fdb get request"); return -EINVAL; } *addr = nla_data(tb[i]); break; case NDA_VLAN: err = fdb_vid_parse(tb[i], vid, extack); if (err) return err; break; case NDA_VNI: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request"); return -EINVAL; } } return 0; } static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net_device *dev = NULL, *br_dev = NULL; const struct net_device_ops *ops = NULL; struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NDA_MAX + 1]; struct sk_buff *skb; int brport_idx = 0; u8 ndm_flags = 0; int br_idx = 0; u8 *addr = NULL; u16 vid = 0; int err; err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx, &brport_idx, &addr, &vid, extack); if (err < 0) return err; if (!addr) { NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request"); return -EINVAL; } if (brport_idx) { dev = __dev_get_by_index(net, brport_idx); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); return -ENODEV; } } if (br_idx) { if (dev) { NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive"); return -EINVAL; } br_dev = __dev_get_by_index(net, br_idx); if (!br_dev) { NL_SET_ERR_MSG(extack, "Invalid master ifindex"); return -EINVAL; } ops = br_dev->netdev_ops; } if (dev) { if (!ndm_flags || (ndm_flags & NTF_MASTER)) { if (!netif_is_bridge_port(dev)) { NL_SET_ERR_MSG(extack, "Device is not a bridge port"); return -EINVAL; } br_dev = netdev_master_upper_dev_get(dev); if (!br_dev) { NL_SET_ERR_MSG(extack, "Master of device not found"); return -EINVAL; } ops = br_dev->netdev_ops; } else { if (!(ndm_flags & NTF_SELF)) { NL_SET_ERR_MSG(extack, "Missing NTF_SELF"); return -EINVAL; } ops = dev->netdev_ops; } } if (!br_dev && !dev) { NL_SET_ERR_MSG(extack, "No device specified"); return -ENODEV; } if (!ops || !ops->ndo_fdb_get) { NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device"); return -EOPNOTSUPP; } skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (br_dev) dev = br_dev; err = ops->ndo_fdb_get(skb, tb, dev, addr, vid, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, extack); if (err) goto out; return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: kfree_skb(skb); return err; } static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, unsigned int attrnum, unsigned int flag) { if (mask & flag) return nla_put_u8(skb, attrnum, !!(flags & flag)); return 0; } int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, u32 flags, u32 mask, int nlflags, u32 filter_mask, int (*vlan_fill)(struct sk_buff *skb, struct net_device *dev, u32 filter_mask)) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; struct nlattr *br_afspec; struct nlattr *protinfo; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); int err = 0; nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifi_family = AF_BRIDGE; ifm->__ifi_pad = 0; ifm->ifi_type = dev->type; ifm->ifi_index = dev->ifindex; ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = 0; if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u8(skb, IFLA_OPERSTATE, operstate) || (br_dev && nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || (dev->ifindex != dev_get_iflink(dev) && nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; br_afspec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!br_afspec) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } if (mode != BRIDGE_MODE_UNDEF) { if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } } if (vlan_fill) { err = vlan_fill(skb, dev, filter_mask); if (err) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } } nla_nest_end(skb, br_afspec); protinfo = nla_nest_start(skb, IFLA_PROTINFO); if (!protinfo) goto nla_put_failure; if (brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_GUARD, BR_BPDU_GUARD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_LEARNING, BR_LEARNING) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_PROXYARP, BR_PROXYARP) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) { nla_nest_cancel(skb, protinfo); goto nla_put_failure; } nla_nest_end(skb, protinfo); nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return err ? err : -EMSGSIZE; } EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, bool strict_check, u32 *filter_mask, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX+1]; int err, i; if (strict_check) { struct ifinfomsg *ifm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); return -EINVAL; } ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); } else { err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); } if (err < 0) return err; /* new attributes should only be added with strict checking */ for (i = 0; i <= IFLA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case IFLA_EXT_MASK: *filter_mask = nla_get_u32(tb[i]); break; default: if (strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request"); return -EINVAL; } } } return 0; } static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = nlh->nlmsg_seq; u32 filter_mask = 0; int err; err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask, cb->extack); if (err < 0 && cb->strict_check) return err; rcu_read_lock(); for_each_netdev_rcu(net, dev) { const struct net_device_ops *ops = dev->netdev_ops; struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { if (idx >= cb->args[0]) { err = br_dev->netdev_ops->ndo_bridge_getlink( skb, portid, seq, dev, filter_mask, NLM_F_MULTI); if (err < 0 && err != -EOPNOTSUPP) { if (likely(skb->len)) break; goto out_err; } } idx++; } if (ops->ndo_bridge_getlink) { if (idx >= cb->args[0]) { err = ops->ndo_bridge_getlink(skb, portid, seq, dev, filter_mask, NLM_F_MULTI); if (err < 0 && err != -EOPNOTSUPP) { if (likely(skb->len)) break; goto out_err; } } idx++; } } err = skb->len; out_err: rcu_read_unlock(); cb->args[0] = idx; return err; } static inline size_t bridge_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(sizeof(u32)) /* IFLA_MASTER */ + nla_total_size(sizeof(u32)) /* IFLA_MTU */ + nla_total_size(sizeof(u32)) /* IFLA_LINK */ + nla_total_size(sizeof(u32)) /* IFLA_OPERSTATE */ + nla_total_size(sizeof(u8)) /* IFLA_PROTINFO */ + nla_total_size(sizeof(struct nlattr)) /* IFLA_AF_SPEC */ + nla_total_size(sizeof(u16)) /* IFLA_BRIDGE_FLAGS */ + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */ } static int rtnl_bridge_notify(struct net_device *dev) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -EOPNOTSUPP; if (!dev->netdev_ops->ndo_bridge_getlink) return 0; skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC); if (!skb) { err = -ENOMEM; goto errout; } err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0); if (err < 0) goto errout; /* Notification info is only filled for bridge ports, not the bridge * device itself. Therefore, a zero notification length is valid and * should not result in an error. */ if (!skb->len) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return 0; errout: WARN_ON(err == -EMSGSIZE); kfree_skb(skb); if (err) rtnl_set_sk_err(net, RTNLGRP_LINK, err); return err; } static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; struct nlattr *br_spec, *attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; bool have_flags = false; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_family != AF_BRIDGE) return -EPFNOSUPPORT; dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !have_flags) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; have_flags = true; flags = nla_get_u16(attr); } if (nla_type(attr) == IFLA_BRIDGE_MODE) { if (nla_len(attr) < sizeof(u16)) return -EINVAL; } } } if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) { err = -EOPNOTSUPP; goto out; } err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, extack); if (err) goto out; flags &= ~BRIDGE_FLAGS_MASTER; } if ((flags & BRIDGE_FLAGS_SELF)) { if (!dev->netdev_ops->ndo_bridge_setlink) err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, extack); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; /* Generate event to notify upper layer of bridge * change */ err = rtnl_bridge_notify(dev); } } if (have_flags) memcpy(nla_data(attr), &flags, sizeof(flags)); out: return err; } static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; struct nlattr *br_spec, *attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; bool have_flags = false; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_family != AF_BRIDGE) return -EPFNOSUPPORT; dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; have_flags = true; flags = nla_get_u16(attr); break; } } } if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) { err = -EOPNOTSUPP; goto out; } err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (err) goto out; flags &= ~BRIDGE_FLAGS_MASTER; } if ((flags & BRIDGE_FLAGS_SELF)) { if (!dev->netdev_ops->ndo_bridge_dellink) err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; /* Generate event to notify upper layer of bridge * change */ err = rtnl_bridge_notify(dev); } } if (have_flags) memcpy(nla_data(attr), &flags, sizeof(flags)); out: return err; } static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) { return (mask & IFLA_STATS_FILTER_BIT(attrid)) && (!idxattr || idxattr == attrid); } static bool rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id) { return dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && dev->netdev_ops->ndo_get_offload_stats && dev->netdev_ops->ndo_has_offload_stats(dev, attr_id); } static unsigned int rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id) { return rtnl_offload_xstats_have_ndo(dev, attr_id) ? sizeof(struct rtnl_link_stats64) : 0; } static int rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id, struct sk_buff *skb) { unsigned int size = rtnl_offload_xstats_get_size_ndo(dev, attr_id); struct nlattr *attr = NULL; void *attr_data; int err; if (!size) return -ENODATA; attr = nla_reserve_64bit(skb, attr_id, size, IFLA_OFFLOAD_XSTATS_UNSPEC); if (!attr) return -EMSGSIZE; attr_data = nla_data(attr); memset(attr_data, 0, size); err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data); if (err) return err; return 0; } static unsigned int rtnl_offload_xstats_get_size_stats(const struct net_device *dev, enum netdev_offload_xstats_type type) { bool enabled = netdev_offload_xstats_enabled(dev, type); return enabled ? sizeof(struct rtnl_hw_stats64) : 0; } struct rtnl_offload_xstats_request_used { bool request; bool used; }; static int rtnl_offload_xstats_get_stats(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_offload_xstats_request_used *ru, struct rtnl_hw_stats64 *stats, struct netlink_ext_ack *extack) { bool request; bool used; int err; request = netdev_offload_xstats_enabled(dev, type); if (!request) { used = false; goto out; } err = netdev_offload_xstats_get(dev, type, stats, &used, extack); if (err) return err; out: if (ru) { ru->request = request; ru->used = used; } return 0; } static int rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id, struct rtnl_offload_xstats_request_used *ru) { struct nlattr *nest; nest = nla_nest_start(skb, attr_id); if (!nest) return -EMSGSIZE; if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, ru->request)) goto nla_put_failure; if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev, struct netlink_ext_ack *extack) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; struct rtnl_offload_xstats_request_used ru_l3; struct nlattr *nest; int err; err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack); if (err) return err; nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO); if (!nest) return -EMSGSIZE; if (rtnl_offload_xstats_fill_hw_s_info_one(skb, IFLA_OFFLOAD_XSTATS_L3_STATS, &ru_l3)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev, int *prividx, u32 off_filter_mask, struct netlink_ext_ack *extack) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO; int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS; int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; bool have_data = false; int err; if (*prividx <= attr_id_cpu_hit && (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) { err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb); if (!err) { have_data = true; } else if (err != -ENODATA) { *prividx = attr_id_cpu_hit; return err; } } if (*prividx <= attr_id_hw_s_info && (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) { *prividx = attr_id_hw_s_info; err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack); if (err) return err; have_data = true; *prividx = 0; } if (*prividx <= attr_id_l3_stats && (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) { unsigned int size_l3; struct nlattr *attr; *prividx = attr_id_l3_stats; size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3); if (!size_l3) goto skip_l3_stats; attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3, IFLA_OFFLOAD_XSTATS_UNSPEC); if (!attr) return -EMSGSIZE; err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL, nla_data(attr), extack); if (err) return err; have_data = true; skip_l3_stats: *prividx = 0; } if (!have_data) return -ENODATA; *prividx = 0; return 0; } static unsigned int rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev, enum netdev_offload_xstats_type type) { return nla_total_size(0) + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */ nla_total_size(sizeof(u8)) + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */ nla_total_size(sizeof(u8)) + 0; } static unsigned int rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; return nla_total_size(0) + /* IFLA_OFFLOAD_XSTATS_L3_STATS */ rtnl_offload_xstats_get_size_hw_s_info_one(dev, t_l3) + 0; } static int rtnl_offload_xstats_get_size(const struct net_device *dev, u32 off_filter_mask) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; int nla_size = 0; int size; if (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_cpu_hit)) { size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit); nla_size += nla_total_size_64bit(size); } if (off_filter_mask & IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO)) nla_size += rtnl_offload_xstats_get_size_hw_s_info(dev); if (off_filter_mask & IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) { size = rtnl_offload_xstats_get_size_stats(dev, t_l3); nla_size += nla_total_size_64bit(size); } if (nla_size != 0) nla_size += nla_total_size(0); return nla_size; } struct rtnl_stats_dump_filters { /* mask[0] filters outer attributes. Then individual nests have their * filtering mask at the index of the nested attribute. */ u32 mask[IFLA_STATS_MAX + 1]; }; static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, const struct rtnl_stats_dump_filters *filters, int *idxattr, int *prividx, struct netlink_ext_ack *extack) { unsigned int filter_mask = filters->mask[0]; struct if_stats_msg *ifsm; struct nlmsghdr *nlh; struct nlattr *attr; int s_prividx = *prividx; int err; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags); if (!nlh) return -EMSGSIZE; ifsm = nlmsg_data(nlh); ifsm->family = PF_UNSPEC; ifsm->pad1 = 0; ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) { struct rtnl_link_stats64 *sp; attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64, sizeof(struct rtnl_link_stats64), IFLA_STATS_UNSPEC); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } sp = nla_data(attr); dev_get_stats(dev, sp); } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; *idxattr = 0; } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, *idxattr)) { const struct rtnl_link_ops *ops = NULL; const struct net_device *master; master = netdev_master_upper_dev_get(dev); if (master) ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS_SLAVE); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; *idxattr = 0; } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, *idxattr)) { u32 off_filter_mask; off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } err = rtnl_offload_xstats_fill(skb, dev, prividx, off_filter_mask, extack); if (err == -ENODATA) nla_nest_cancel(skb, attr); else nla_nest_end(skb, attr); if (err && err != -ENODATA) goto nla_put_failure; *idxattr = 0; } if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) { struct rtnl_af_ops *af_ops; *idxattr = IFLA_STATS_AF_SPEC; attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC); if (!attr) { err = -EMSGSIZE; goto nla_put_failure; } rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->fill_stats_af) { struct nlattr *af; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) { rcu_read_unlock(); err = -EMSGSIZE; goto nla_put_failure; } err = af_ops->fill_stats_af(skb, dev); if (err == -ENODATA) { nla_nest_cancel(skb, af); } else if (err < 0) { rcu_read_unlock(); goto nla_put_failure; } nla_nest_end(skb, af); } } rcu_read_unlock(); nla_nest_end(skb, attr); *idxattr = 0; } nlmsg_end(skb, nlh); return 0; nla_put_failure: /* not a multi message or no progress mean a real error */ if (!(flags & NLM_F_MULTI) || s_prividx == *prividx) nlmsg_cancel(skb, nlh); else nlmsg_end(skb, nlh); return err; } static size_t if_nlmsg_stats_size(const struct net_device *dev, const struct rtnl_stats_dump_filters *filters) { size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); unsigned int filter_mask = filters->mask[0]; if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; int attr = IFLA_STATS_LINK_XSTATS; if (ops && ops->get_linkxstats_size) { size += nla_total_size(ops->get_linkxstats_size(dev, attr)); /* for IFLA_STATS_LINK_XSTATS */ size += nla_total_size(0); } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) { struct net_device *_dev = (struct net_device *)dev; const struct rtnl_link_ops *ops = NULL; const struct net_device *master; /* netdev_master_upper_dev_get can't take const */ master = netdev_master_upper_dev_get(_dev); if (master) ops = master->rtnl_link_ops; if (ops && ops->get_linkxstats_size) { int attr = IFLA_STATS_LINK_XSTATS_SLAVE; size += nla_total_size(ops->get_linkxstats_size(dev, attr)); /* for IFLA_STATS_LINK_XSTATS_SLAVE */ size += nla_total_size(0); } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) { u32 off_filter_mask; off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; size += rtnl_offload_xstats_get_size(dev, off_filter_mask); } if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { struct rtnl_af_ops *af_ops; /* for IFLA_STATS_AF_SPEC */ size += nla_total_size(0); rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_stats_af_size) { size += nla_total_size( af_ops->get_stats_af_size(dev)); /* for AF_* */ size += nla_total_size(0); } } rcu_read_unlock(); } return size; } #define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1) static const struct nla_policy rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = { [IFLA_STATS_LINK_OFFLOAD_XSTATS] = NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID), }; static const struct nla_policy rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = { [IFLA_STATS_GET_FILTERS] = NLA_POLICY_NESTED(rtnl_stats_get_policy_filters), }; static const struct nla_policy ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = { [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1), }; static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters, struct rtnl_stats_dump_filters *filters, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_STATS_MAX + 1]; int err; int at; err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters, rtnl_stats_get_policy_filters, extack); if (err < 0) return err; for (at = 1; at <= IFLA_STATS_MAX; at++) { if (tb[at]) { if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(at))) { NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask"); return -EINVAL; } filters->mask[at] = nla_get_u32(tb[at]); } } return 0; } static int rtnl_stats_get_parse(const struct nlmsghdr *nlh, u32 filter_mask, struct rtnl_stats_dump_filters *filters, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; int err; int i; filters->mask[0] = filter_mask; for (i = 1; i < ARRAY_SIZE(filters->mask); i++) filters->mask[i] = -1U; err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb, IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack); if (err < 0) return err; if (tb[IFLA_STATS_GET_FILTERS]) { err = rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS], filters, extack); if (err) return err; } return 0; } static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, bool is_dump, struct netlink_ext_ack *extack) { struct if_stats_msg *ifsm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } if (!strict_check) return 0; ifsm = nlmsg_data(nlh); /* only requests using strict checks can pass data to influence * the dump. The legacy exception is filter_mask. */ if (ifsm->pad1 || ifsm->pad2 || (is_dump && ifsm->ifindex)) { NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); return -EINVAL; } if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) { NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask"); return -EINVAL; } return 0; } static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; int idxattr = 0, prividx = 0; struct if_stats_msg *ifsm; struct sk_buff *nskb; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), false, extack); if (err) return err; ifsm = nlmsg_data(nlh); if (ifsm->ifindex > 0) dev = __dev_get_by_index(net, ifsm->ifindex); else return -EINVAL; if (!dev) return -ENODEV; if (!ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get"); return -EINVAL; } err = rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack); if (err) return err; nskb = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL); if (!nskb) return -ENOBUFS; err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, &filters, &idxattr, &prividx, extack); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else { err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); } return err; } static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; int h, s_h, err, s_idx, s_idxattr, s_prividx; struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; struct hlist_head *head; struct net_device *dev; int idx = 0; s_h = cb->args[0]; s_idx = cb->args[1]; s_idxattr = cb->args[2]; s_prividx = cb->args[3]; cb->seq = net->dev_base_seq; err = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true, extack); if (err) return err; ifsm = nlmsg_data(cb->nlh); if (!ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); return -EINVAL; } err = rtnl_stats_get_parse(cb->nlh, ifsm->filter_mask, &filters, extack); if (err) return err; for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { if (idx < s_idx) goto cont; err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, &filters, &s_idxattr, &s_prividx, extack); /* If we ran out of room on the first message, * we're in trouble */ WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); if (err < 0) goto out; s_prividx = 0; s_idxattr = 0; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } out: cb->args[3] = s_prividx; cb->args[2] = s_idxattr; cb->args[1] = idx; cb->args[0] = h; return skb->len; } void rtnl_offload_xstats_notify(struct net_device *dev) { struct rtnl_stats_dump_filters response_filters = {}; struct net *net = dev_net(dev); int idxattr = 0, prividx = 0; struct sk_buff *skb; int err = -ENOBUFS; ASSERT_RTNL(); response_filters.mask[0] |= IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters), GFP_KERNEL); if (!skb) goto errout; err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0, &response_filters, &idxattr, &prividx, NULL); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_STATS, err); } EXPORT_SYMBOL(rtnl_offload_xstats_notify); static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; struct rtnl_stats_dump_filters response_filters = {}; struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; struct if_stats_msg *ifsm; bool notify = false; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), false, extack); if (err) return err; ifsm = nlmsg_data(nlh); if (ifsm->family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC"); return -EINVAL; } if (ifsm->ifindex > 0) dev = __dev_get_by_index(net, ifsm->ifindex); else return -EINVAL; if (!dev) return -ENODEV; if (ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set"); return -EINVAL; } err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX, ifla_stats_set_policy, extack); if (err < 0) return err; if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) { u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]); if (req) err = netdev_offload_xstats_enable(dev, t_l3, extack); else err = netdev_offload_xstats_disable(dev, t_l3); if (!err) notify = true; else if (err != -EALREADY) return err; response_filters.mask[0] |= IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); } if (notify) rtnl_offload_xstats_notify(dev); return 0; } static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct br_port_msg *bpm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) { NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request"); return -EINVAL; } bpm = nlmsg_data(nlh); if (bpm->ifindex) { NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*bpm))) { NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request"); return -EINVAL; } return 0; } struct rtnl_mdb_dump_ctx { long idx; }; static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct rtnl_mdb_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx, s_idx; int err; NL_ASSERT_DUMP_CTX_FITS(struct rtnl_mdb_dump_ctx); if (cb->strict_check) { err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack); if (err) return err; } s_idx = ctx->idx; idx = 0; for_each_netdev(net, dev) { if (idx < s_idx) goto skip; if (!dev->netdev_ops->ndo_mdb_dump) goto skip; err = dev->netdev_ops->ndo_mdb_dump(dev, skb, cb); if (err == -EMSGSIZE) goto out; /* Moving on to next device, reset markers and sequence * counters since they are all maintained per-device. */ memset(cb->ctx, 0, sizeof(cb->ctx)); cb->prev_seq = 0; cb->seq = 0; skip: idx++; } out: ctx->idx = idx; return skb->len; } static int rtnl_validate_mdb_entry_get(const struct nlattr *attr, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(attr); if (nla_len(attr) != sizeof(struct br_mdb_entry)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); return -EINVAL; } if (entry->ifindex) { NL_SET_ERR_MSG(extack, "Entry ifindex cannot be specified"); return -EINVAL; } if (entry->state) { NL_SET_ERR_MSG(extack, "Entry state cannot be specified"); return -EINVAL; } if (entry->flags) { NL_SET_ERR_MSG(extack, "Entry flags cannot be specified"); return -EINVAL; } if (entry->vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); return -EINVAL; } if (entry->addr.proto != htons(ETH_P_IP) && entry->addr.proto != htons(ETH_P_IPV6) && entry->addr.proto != 0) { NL_SET_ERR_MSG(extack, "Unknown entry protocol"); return -EINVAL; } return 0; } static const struct nla_policy mdba_get_policy[MDBA_GET_ENTRY_MAX + 1] = { [MDBA_GET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, rtnl_validate_mdb_entry_get, sizeof(struct br_mdb_entry)), [MDBA_GET_ENTRY_ATTRS] = { .type = NLA_NESTED }, }; static int rtnl_mdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[MDBA_GET_ENTRY_MAX + 1]; struct net *net = sock_net(in_skb->sk); struct br_port_msg *bpm; struct net_device *dev; int err; err = nlmsg_parse(nlh, sizeof(struct br_port_msg), tb, MDBA_GET_ENTRY_MAX, mdba_get_policy, extack); if (err) return err; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, bpm->ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Device doesn't exist"); return -ENODEV; } if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_GET_ENTRY)) { NL_SET_ERR_MSG(extack, "Missing MDBA_GET_ENTRY attribute"); return -EINVAL; } if (!dev->netdev_ops->ndo_mdb_get) { NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); return -EOPNOTSUPP; } return dev->netdev_ops->ndo_mdb_get(dev, tb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, extack); } static int rtnl_validate_mdb_entry(const struct nlattr *attr, struct netlink_ext_ack *extack) { struct br_mdb_entry *entry = nla_data(attr); if (nla_len(attr) != sizeof(struct br_mdb_entry)) { NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length"); return -EINVAL; } if (entry->ifindex == 0) { NL_SET_ERR_MSG(extack, "Zero entry ifindex is not allowed"); return -EINVAL; } if (entry->addr.proto == htons(ETH_P_IP)) { if (!ipv4_is_multicast(entry->addr.u.ip4) && !ipv4_is_zeronet(entry->addr.u.ip4)) { NL_SET_ERR_MSG(extack, "IPv4 entry group address is not multicast or 0.0.0.0"); return -EINVAL; } if (ipv4_is_local_multicast(entry->addr.u.ip4)) { NL_SET_ERR_MSG(extack, "IPv4 entry group address is local multicast"); return -EINVAL; } #if IS_ENABLED(CONFIG_IPV6) } else if (entry->addr.proto == htons(ETH_P_IPV6)) { if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) { NL_SET_ERR_MSG(extack, "IPv6 entry group address is link-local all nodes"); return -EINVAL; } #endif } else if (entry->addr.proto == 0) { /* L2 mdb */ if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) { NL_SET_ERR_MSG(extack, "L2 entry group is not multicast"); return -EINVAL; } } else { NL_SET_ERR_MSG(extack, "Unknown entry protocol"); return -EINVAL; } if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) { NL_SET_ERR_MSG(extack, "Unknown entry state"); return -EINVAL; } if (entry->vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "Invalid entry VLAN id"); return -EINVAL; } return 0; } static const struct nla_policy mdba_policy[MDBA_SET_ENTRY_MAX + 1] = { [MDBA_SET_ENTRY_UNSPEC] = { .strict_start_type = MDBA_SET_ENTRY_ATTRS + 1 }, [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, rtnl_validate_mdb_entry, sizeof(struct br_mdb_entry)), [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED }, }; static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; struct net *net = sock_net(skb->sk); struct br_port_msg *bpm; struct net_device *dev; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, mdba_policy, extack); if (err) return err; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, bpm->ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Device doesn't exist"); return -ENODEV; } if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) { NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute"); return -EINVAL; } if (!dev->netdev_ops->ndo_mdb_add) { NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); return -EOPNOTSUPP; } return dev->netdev_ops->ndo_mdb_add(dev, tb, nlh->nlmsg_flags, extack); } static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; struct net *net = sock_net(skb->sk); struct br_port_msg *bpm; struct net_device *dev; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, mdba_policy, extack); if (err) return err; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, bpm->ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Device doesn't exist"); return -ENODEV; } if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) { NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute"); return -EINVAL; } if (!dev->netdev_ops->ndo_mdb_del) { NL_SET_ERR_MSG(extack, "Device does not support MDB operations"); return -EOPNOTSUPP; } return dev->netdev_ops->ndo_mdb_del(dev, tb, extack); } /* Process one rtnetlink message. */ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct rtnl_link *link; enum rtnl_kinds kind; struct module *owner; int err = -EOPNOTSUPP; rtnl_doit_func doit; unsigned int flags; int family; int type; type = nlh->nlmsg_type; if (type > RTM_MAX) return -EOPNOTSUPP; type -= RTM_BASE; /* All the messages must have at least 1 byte length */ if (nlmsg_len(nlh) < sizeof(struct rtgenmsg)) return 0; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; kind = rtnl_msgtype_kind(type); if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; rcu_read_lock(); if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) { struct sock *rtnl; rtnl_dumpit_func dumpit; u32 min_dump_alloc = 0; link = rtnl_get_link(family, type); if (!link || !link->dumpit) { family = PF_UNSPEC; link = rtnl_get_link(family, type); if (!link || !link->dumpit) goto err_unlock; } owner = link->owner; dumpit = link->dumpit; if (type == RTM_GETLINK - RTM_BASE) min_dump_alloc = rtnl_calcit(skb, nlh); err = 0; /* need to do this before rcu_read_unlock() */ if (!try_module_get(owner)) err = -EPROTONOSUPPORT; rcu_read_unlock(); rtnl = net->rtnl; if (err == 0) { struct netlink_dump_control c = { .dump = dumpit, .min_dump_alloc = min_dump_alloc, .module = owner, }; err = netlink_dump_start(rtnl, skb, nlh, &c); /* netlink_dump_start() will keep a reference on * module if dump is still in progress. */ module_put(owner); } return err; } link = rtnl_get_link(family, type); if (!link || !link->doit) { family = PF_UNSPEC; link = rtnl_get_link(PF_UNSPEC, type); if (!link || !link->doit) goto out_unlock; } owner = link->owner; if (!try_module_get(owner)) { err = -EPROTONOSUPPORT; goto out_unlock; } flags = link->flags; if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) && !(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) { NL_SET_ERR_MSG(extack, "Bulk delete is not supported"); module_put(owner); goto err_unlock; } if (flags & RTNL_FLAG_DOIT_UNLOCKED) { doit = link->doit; rcu_read_unlock(); if (doit) err = doit(skb, nlh, extack); module_put(owner); return err; } rcu_read_unlock(); rtnl_lock(); link = rtnl_get_link(family, type); if (link && link->doit) err = link->doit(skb, nlh, extack); rtnl_unlock(); module_put(owner); return err; out_unlock: rcu_read_unlock(); return err; err_unlock: rcu_read_unlock(); return -EOPNOTSUPP; } static void rtnetlink_rcv(struct sk_buff *skb) { netlink_rcv_skb(skb, &rtnetlink_rcv_msg); } static int rtnetlink_bind(struct net *net, int group) { switch (group) { case RTNLGRP_IPV4_MROUTE_R: case RTNLGRP_IPV6_MROUTE_R: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; break; } return 0; } static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REBOOT: case NETDEV_CHANGEMTU: case NETDEV_CHANGEADDR: case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: case NETDEV_BONDING_FAILOVER: case NETDEV_POST_TYPE_CHANGE: case NETDEV_NOTIFY_PEERS: case NETDEV_CHANGEUPPER: case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: case NETDEV_CHANGELOWERSTATE: case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), GFP_KERNEL, NULL, 0, 0, NULL); break; default: break; } return NOTIFY_DONE; } static struct notifier_block rtnetlink_dev_notifier = { .notifier_call = rtnetlink_event, }; static int __net_init rtnetlink_net_init(struct net *net) { struct sock *sk; struct netlink_kernel_cfg cfg = { .groups = RTNLGRP_MAX, .input = rtnetlink_rcv, .cb_mutex = &rtnl_mutex, .flags = NL_CFG_F_NONROOT_RECV, .bind = rtnetlink_bind, }; sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg); if (!sk) return -ENOMEM; net->rtnl = sk; return 0; } static void __net_exit rtnetlink_net_exit(struct net *net) { netlink_kernel_release(net->rtnl); net->rtnl = NULL; } static struct pernet_operations rtnetlink_net_ops = { .init = rtnetlink_net_init, .exit = rtnetlink_net_exit, }; void __init rtnetlink_init(void) { if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); register_netdevice_notifier(&rtnetlink_dev_notifier); rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, rtnl_dump_ifinfo, 0); rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, RTNL_FLAG_BULK_DEL_SUPPORTED); rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, 0); rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0); rtnl_register(PF_BRIDGE, RTM_GETMDB, rtnl_mdb_get, rtnl_mdb_dump, 0); rtnl_register(PF_BRIDGE, RTM_NEWMDB, rtnl_mdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL, 0); }
71 71 71 36 36 28 16 16 16 16 36 36 36 36 8 1 7 1 6 6 6 6 6 6 6 6 8 4 4 4 4 5 5 5 5 4 4 4 14 14 14 14 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: policy rules. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Thomas Graf <tgraf@suug.ch> * * Fixes: * Rani Assaf : local_rule cannot be deleted * Marc Boucher : routing by fwmark */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/inetdevice.h> #include <linux/init.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/route.h> #include <net/tcp.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/fib_rules.h> #include <linux/indirect_call_wrapper.h> struct fib4_rule { struct fib_rule common; u8 dst_len; u8 src_len; dscp_t dscp; __be32 src; __be32 srcmask; __be32 dst; __be32 dstmask; #ifdef CONFIG_IP_ROUTE_CLASSID u32 tclassid; #endif }; static bool fib4_rule_matchall(const struct fib_rule *rule) { struct fib4_rule *r = container_of(rule, struct fib4_rule, common); if (r->dst_len || r->src_len || r->dscp) return false; return fib_rule_matchall(rule); } bool fib4_rule_default(const struct fib_rule *rule) { if (!fib4_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL || rule->l3mdev) return false; if (rule->table != RT_TABLE_LOCAL && rule->table != RT_TABLE_MAIN && rule->table != RT_TABLE_DEFAULT) return false; return true; } EXPORT_SYMBOL_GPL(fib4_rule_default); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, AF_INET, extack); } unsigned int fib4_rules_seq_read(struct net *net) { return fib_rules_seq_read(net, AF_INET); } int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_lookup_arg arg = { .result = res, .flags = flags, }; int err; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi4_to_flowi(flp)); err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); #ifdef CONFIG_IP_ROUTE_CLASSID if (arg.rule) res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid; else res->tclassid = 0; #endif if (err == -ESRCH) err = -ENETUNREACH; return err; } EXPORT_SYMBOL_GPL(__fib_lookup); INDIRECT_CALLABLE_SCOPE int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { int err = -EAGAIN; struct fib_table *tbl; u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } rcu_read_lock(); tb_id = fib_rule_get_table(rule, arg); tbl = fib_get_table(rule->fr_net, tb_id); if (tbl) err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *)arg->result, arg->flags); rcu_read_unlock(); return err; } INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg) { struct fib_result *result = arg->result; struct net_device *dev = NULL; if (result->fi) { struct fib_nh_common *nhc = fib_info_nhc(result->fi, 0); dev = nhc->nhc_dev; } /* do not accept result if the route does * not meet the required prefix length */ if (result->prefixlen <= rule->suppress_prefixlen) goto suppress_route; /* do not accept result if the route uses a device * belonging to a forbidden interface group */ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) goto suppress_route; return false; suppress_route: if (!(arg->flags & FIB_LOOKUP_NOREF)) fib_info_put(result->fi); return true; } INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib4_rule *r = (struct fib4_rule *) rule; struct flowi4 *fl4 = &fl->u.ip4; __be32 daddr = fl4->daddr; __be32 saddr = fl4->saddr; if (((saddr ^ r->src) & r->srcmask) || ((daddr ^ r->dst) & r->dstmask)) return 0; if (r->dscp && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos)) return 0; if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto)) return 0; if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_inrange(&rule->sport_range, fl4->fl4_sport)) return 0; if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_inrange(&rule->dport_range, fl4->fl4_dport)) return 0; return 1; } static struct fib_table *fib_empty_table(struct net *net) { u32 id = 1; while (1) { if (!fib_get_table(net, id)) return fib_new_table(net, id); if (id++ == RT_TABLE_MAX) break; } return NULL; } static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); int err = -EINVAL; struct fib4_rule *rule4 = (struct fib4_rule *) rule; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); goto errout; } /* IPv4 currently doesn't handle high order DSCP bits correctly */ if (frh->tos & ~IPTOS_TOS_MASK) { NL_SET_ERR_MSG(extack, "Invalid tos"); goto errout; } rule4->dscp = inet_dsfield_to_dscp(frh->tos); /* split local/main if they are not already split */ err = fib_unmerge(net); if (err) goto errout; if (rule->table == RT_TABLE_UNSPEC && !rule->l3mdev) { if (rule->action == FR_ACT_TO_TBL) { struct fib_table *table; table = fib_empty_table(net); if (!table) { err = -ENOBUFS; goto errout; } rule->table = table->tb_id; } } if (frh->src_len) rule4->src = nla_get_in_addr(tb[FRA_SRC]); if (frh->dst_len) rule4->dst = nla_get_in_addr(tb[FRA_DST]); #ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW]) { rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); if (rule4->tclassid) atomic_inc(&net->ipv4.fib_num_tclassid_users); } #endif if (fib_rule_requires_fldissect(rule)) net->ipv4.fib_rules_require_fldissect++; rule4->src_len = frh->src_len; rule4->srcmask = inet_make_mask(rule4->src_len); rule4->dst_len = frh->dst_len; rule4->dstmask = inet_make_mask(rule4->dst_len); net->ipv4.fib_has_custom_rules = true; err = 0; errout: return err; } static int fib4_rule_delete(struct fib_rule *rule) { struct net *net = rule->fr_net; int err; /* split local/main if they are not already split */ err = fib_unmerge(net); if (err) goto errout; #ifdef CONFIG_IP_ROUTE_CLASSID if (((struct fib4_rule *)rule)->tclassid) atomic_dec(&net->ipv4.fib_num_tclassid_users); #endif net->ipv4.fib_has_custom_rules = true; if (net->ipv4.fib_rules_require_fldissect && fib_rule_requires_fldissect(rule)) net->ipv4.fib_rules_require_fldissect--; errout: return err; } static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { struct fib4_rule *rule4 = (struct fib4_rule *) rule; if (frh->src_len && (rule4->src_len != frh->src_len)) return 0; if (frh->dst_len && (rule4->dst_len != frh->dst_len)) return 0; if (frh->tos && inet_dscp_to_dsfield(rule4->dscp) != frh->tos) return 0; #ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; #endif if (frh->src_len && (rule4->src != nla_get_in_addr(tb[FRA_SRC]))) return 0; if (frh->dst_len && (rule4->dst != nla_get_in_addr(tb[FRA_DST]))) return 0; return 1; } static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { struct fib4_rule *rule4 = (struct fib4_rule *) rule; frh->dst_len = rule4->dst_len; frh->src_len = rule4->src_len; frh->tos = inet_dscp_to_dsfield(rule4->dscp); if ((rule4->dst_len && nla_put_in_addr(skb, FRA_DST, rule4->dst)) || (rule4->src_len && nla_put_in_addr(skb, FRA_SRC, rule4->src))) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rule4->tclassid && nla_put_u32(skb, FRA_FLOW, rule4->tclassid)) goto nla_put_failure; #endif return 0; nla_put_failure: return -ENOBUFS; } static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(4) /* dst */ + nla_total_size(4) /* src */ + nla_total_size(4); /* flow */ } static void fib4_rule_flush_cache(struct fib_rules_ops *ops) { rt_cache_flush(ops->fro_net); } static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .family = AF_INET, .rule_size = sizeof(struct fib4_rule), .addr_size = sizeof(u32), .action = fib4_rule_action, .suppress = fib4_rule_suppress, .match = fib4_rule_match, .configure = fib4_rule_configure, .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, .nlmsg_payload = fib4_rule_nlmsg_payload, .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, .owner = THIS_MODULE, }; static int fib_default_rules_init(struct fib_rules_ops *ops) { int err; err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0); if (err < 0) return err; err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0); if (err < 0) return err; err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0); if (err < 0) return err; return 0; } int __net_init fib4_rules_init(struct net *net) { int err; struct fib_rules_ops *ops; ops = fib_rules_register(&fib4_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); err = fib_default_rules_init(ops); if (err < 0) goto fail; net->ipv4.rules_ops = ops; net->ipv4.fib_has_custom_rules = false; net->ipv4.fib_rules_require_fldissect = 0; return 0; fail: /* also cleans all rules already added */ fib_rules_unregister(ops); return err; } void __net_exit fib4_rules_exit(struct net *net) { fib_rules_unregister(net->ipv4.rules_ops); }
546 4 2769 2 2738 2738 2738 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_MQ_SCHED_H #define BLK_MQ_SCHED_H #include "elevator.h" #include "blk-mq.h" #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) __blk_mq_sched_restart(hctx); } static inline bool bio_mergeable(struct bio *bio) { return !(bio->bi_opf & REQ_NOMERGE_FLAGS); } static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = q->elevator; if (e->type->ops.allow_merge) return e->type->ops.allow_merge(q, rq, bio); } return true; } static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = rq->q->elevator; if (e->type->ops.completed_request) e->type->ops.completed_request(rq, now); } } static inline void blk_mq_sched_requeue_request(struct request *rq) { if (rq->rq_flags & RQF_USE_SCHED) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; if (e->type->ops.requeue_request) e->type->ops.requeue_request(rq); } } static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) { struct elevator_queue *e = hctx->queue->elevator; if (e && e->type->ops.has_work) return e->type->ops.has_work(hctx); return false; } static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } #endif
88 99 47 5 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 /* SPDX-License-Identifier: GPL-2.0 */ /* * Wireless configuration interface internals. * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2018-2022 Intel Corporation */ #ifndef __NET_WIRELESS_CORE_H #define __NET_WIRELESS_CORE_H #include <linux/list.h> #include <linux/netdevice.h> #include <linux/rbtree.h> #include <linux/debugfs.h> #include <linux/rfkill.h> #include <linux/workqueue.h> #include <linux/rtnetlink.h> #include <net/genetlink.h> #include <net/cfg80211.h> #include "reg.h" #define WIPHY_IDX_INVALID -1 struct cfg80211_registered_device { const struct cfg80211_ops *ops; struct list_head list; /* rfkill support */ struct rfkill_ops rfkill_ops; struct work_struct rfkill_block; /* ISO / IEC 3166 alpha2 for which this device is receiving * country IEs on, this can help disregard country IEs from APs * on the same alpha2 quickly. The alpha2 may differ from * cfg80211_regdomain's alpha2 when an intersection has occurred. * If the AP is reconfigured this can also be used to tell us if * the country on the country IE changed. */ char country_ie_alpha2[2]; /* * the driver requests the regulatory core to set this regulatory * domain as the wiphy's. Only used for %REGULATORY_WIPHY_SELF_MANAGED * devices using the regulatory_set_wiphy_regd() API */ const struct ieee80211_regdomain *requested_regd; /* If a Country IE has been received this tells us the environment * which its telling us its in. This defaults to ENVIRON_ANY */ enum environment_cap env; /* wiphy index, internal only */ int wiphy_idx; /* protected by RTNL */ int devlist_generation, wdev_id; int opencount; wait_queue_head_t dev_wait; struct list_head beacon_registrations; spinlock_t beacon_registrations_lock; /* protected by RTNL only */ int num_running_ifaces; int num_running_monitor_ifaces; u64 cookie_counter; /* BSSes/scanning */ spinlock_t bss_lock; struct list_head bss_list; struct rb_root bss_tree; u32 bss_generation; u32 bss_entries; struct cfg80211_scan_request *scan_req; /* protected by RTNL */ struct cfg80211_scan_request *int_scan_req; struct sk_buff *scan_msg; struct list_head sched_scan_req_list; time64_t suspend_at; struct wiphy_work scan_done_wk; struct genl_info *cur_cmd_info; struct work_struct conn_work; struct work_struct event_work; struct delayed_work dfs_update_channels_wk; struct wireless_dev *background_radar_wdev; struct cfg80211_chan_def background_radar_chandef; struct delayed_work background_cac_done_wk; struct work_struct background_cac_abort_wk; /* netlink port which started critical protocol (0 means not started) */ u32 crit_proto_nlportid; struct cfg80211_coalesce *coalesce; struct work_struct destroy_work; struct wiphy_work sched_scan_stop_wk; struct work_struct sched_scan_res_wk; struct cfg80211_chan_def radar_chandef; struct work_struct propagate_radar_detect_wk; struct cfg80211_chan_def cac_done_chandef; struct work_struct propagate_cac_done_wk; struct work_struct mgmt_registrations_update_wk; /* lock for all wdev lists */ spinlock_t mgmt_registrations_lock; struct work_struct wiphy_work; struct list_head wiphy_work_list; /* protects the list above */ spinlock_t wiphy_work_lock; bool suspended; /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wiphy wiphy __aligned(NETDEV_ALIGN); }; static inline struct cfg80211_registered_device *wiphy_to_rdev(struct wiphy *wiphy) { BUG_ON(!wiphy); return container_of(wiphy, struct cfg80211_registered_device, wiphy); } static inline void cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev) { #ifdef CONFIG_PM int i; if (!rdev->wiphy.wowlan_config) return; for (i = 0; i < rdev->wiphy.wowlan_config->n_patterns; i++) kfree(rdev->wiphy.wowlan_config->patterns[i].mask); kfree(rdev->wiphy.wowlan_config->patterns); if (rdev->wiphy.wowlan_config->tcp && rdev->wiphy.wowlan_config->tcp->sock) sock_release(rdev->wiphy.wowlan_config->tcp->sock); kfree(rdev->wiphy.wowlan_config->tcp); kfree(rdev->wiphy.wowlan_config->nd_config); kfree(rdev->wiphy.wowlan_config); #endif } static inline u64 cfg80211_assign_cookie(struct cfg80211_registered_device *rdev) { u64 r = ++rdev->cookie_counter; if (WARN_ON(r == 0)) r = ++rdev->cookie_counter; return r; } extern struct workqueue_struct *cfg80211_wq; extern struct list_head cfg80211_rdev_list; extern int cfg80211_rdev_list_generation; /* This is constructed like this so it can be used in if/else */ static inline int for_each_rdev_check_rtnl(void) { ASSERT_RTNL(); return 0; } #define for_each_rdev(rdev) \ if (for_each_rdev_check_rtnl()) {} else \ list_for_each_entry(rdev, &cfg80211_rdev_list, list) struct cfg80211_internal_bss { struct list_head list; struct list_head hidden_list; struct rb_node rbn; u64 ts_boottime; unsigned long ts; unsigned long refcount; atomic_t hold; /* time at the start of the reception of the first octet of the * timestamp field of the last beacon/probe received for this BSS. * The time is the TSF of the BSS specified by %parent_bssid. */ u64 parent_tsf; /* the BSS according to which %parent_tsf is set. This is set to * the BSS that the interface that requested the scan was connected to * when the beacon/probe was received. */ u8 parent_bssid[ETH_ALEN] __aligned(2); /* must be last because of priv member */ struct cfg80211_bss pub; }; static inline struct cfg80211_internal_bss *bss_from_pub(struct cfg80211_bss *pub) { return container_of(pub, struct cfg80211_internal_bss, pub); } static inline void cfg80211_hold_bss(struct cfg80211_internal_bss *bss) { atomic_inc(&bss->hold); if (bss->pub.transmitted_bss) { bss = container_of(bss->pub.transmitted_bss, struct cfg80211_internal_bss, pub); atomic_inc(&bss->hold); } } static inline void cfg80211_unhold_bss(struct cfg80211_internal_bss *bss) { int r = atomic_dec_return(&bss->hold); WARN_ON(r < 0); if (bss->pub.transmitted_bss) { bss = container_of(bss->pub.transmitted_bss, struct cfg80211_internal_bss, pub); r = atomic_dec_return(&bss->hold); WARN_ON(r < 0); } } struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx); int get_wiphy_idx(struct wiphy *wiphy); struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx); int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, struct net *net); void cfg80211_init_wdev(struct wireless_dev *wdev); void cfg80211_register_wdev(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); static inline bool cfg80211_has_monitors_only(struct cfg80211_registered_device *rdev) { lockdep_assert_held(&rdev->wiphy.mtx); return rdev->num_running_ifaces == rdev->num_running_monitor_ifaces && rdev->num_running_ifaces > 0; } enum cfg80211_event_type { EVENT_CONNECT_RESULT, EVENT_ROAMED, EVENT_DISCONNECTED, EVENT_IBSS_JOINED, EVENT_STOPPED, EVENT_PORT_AUTHORIZED, }; struct cfg80211_event { struct list_head list; enum cfg80211_event_type type; union { struct cfg80211_connect_resp_params cr; struct cfg80211_roam_info rm; struct { const u8 *ie; size_t ie_len; u16 reason; bool locally_generated; } dc; struct { u8 bssid[ETH_ALEN]; struct ieee80211_channel *channel; } ij; struct { u8 peer_addr[ETH_ALEN]; const u8 *td_bitmap; u8 td_bitmap_len; } pa; }; }; struct cfg80211_cached_keys { struct key_params params[4]; u8 data[4][WLAN_KEY_LEN_WEP104]; int def; }; struct cfg80211_beacon_registration { struct list_head list; u32 nlportid; }; struct cfg80211_cqm_config { struct rcu_head rcu_head; u32 rssi_hyst; s32 last_rssi_event_value; enum nl80211_cqm_rssi_threshold_event last_rssi_event_type; bool use_range_api; int n_rssi_thresholds; s32 rssi_thresholds[] __counted_by(n_rssi_thresholds); }; void cfg80211_cqm_rssi_notify_work(struct wiphy *wiphy, struct wiphy_work *work); void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev); /* free object */ void cfg80211_dev_free(struct cfg80211_registered_device *rdev); int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, char *newname); void ieee80211_set_bitrate_flags(struct wiphy *wiphy); void cfg80211_bss_expire(struct cfg80211_registered_device *rdev); void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs); void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, unsigned int link, struct ieee80211_channel *channel); /* IBSS */ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ibss_params *params, struct cfg80211_cached_keys *connkeys); void cfg80211_clear_ibss(struct net_device *dev, bool nowext); int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, bool nowext); void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, struct ieee80211_channel *channel); int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); /* mesh */ extern const struct mesh_config default_mesh_config; extern const struct mesh_setup default_mesh_setup; int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev, struct mesh_setup *setup, const struct mesh_config *conf); int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev); int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef); /* OCB */ int cfg80211_join_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ocb_setup *setup); int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev); /* AP */ int cfg80211_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, int link, bool notify); /* MLME */ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_auth_request *req); int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_assoc_request *req); int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *bssid, const u8 *ie, int ie_len, u16 reason, bool local_state_change); int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *ap_addr, const u8 *ie, int ie_len, u16 reason, bool local_state_change); void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, struct net_device *dev); int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, u16 frame_type, const u8 *match_data, int match_len, bool multicast_rx, struct netlink_ext_ack *extack); void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk); void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie); void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa, const struct ieee80211_ht_cap *ht_capa_mask); void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa, const struct ieee80211_vht_cap *vht_capa_mask); /* SME events */ int cfg80211_connect(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *connect, struct cfg80211_cached_keys *connkeys, const u8 *prev_bssid); void __cfg80211_connect_result(struct net_device *dev, struct cfg80211_connect_resp_params *params, bool wextev); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap); int cfg80211_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason, bool wextev); void __cfg80211_roamed(struct wireless_dev *wdev, struct cfg80211_roam_info *info); void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *peer_addr, const u8 *td_bitmap, u8 td_bitmap_len); int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_autodisconnect_wk(struct work_struct *work); /* SME implementation */ void cfg80211_conn_work(struct work_struct *work); void cfg80211_sme_scan_done(struct net_device *dev); bool cfg80211_sme_rx_assoc_resp(struct wireless_dev *wdev, u16 status); void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len); void cfg80211_sme_disassoc(struct wireless_dev *wdev); void cfg80211_sme_deauth(struct wireless_dev *wdev); void cfg80211_sme_auth_timeout(struct wireless_dev *wdev); void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev); void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev); /* internal helpers */ bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher); bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev, int key_idx, bool pairwise); int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr); void __cfg80211_scan_done(struct wiphy *wiphy, struct wiphy_work *wk); void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool send_message); void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req); int cfg80211_sched_scan_req_possible(struct cfg80211_registered_device *rdev, bool want_multi); void cfg80211_sched_scan_results_wk(struct work_struct *work); int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req, bool driver_initiated); int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, u64 reqid, bool driver_initiated); void cfg80211_upload_connect_keys(struct wireless_dev *wdev); int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, struct vif_params *params); void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev, struct wiphy_work *end); void cfg80211_process_wdev_events(struct wireless_dev *wdev); bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, u32 center_freq_khz, u32 bw_khz); int cfg80211_scan(struct cfg80211_registered_device *rdev); extern struct work_struct cfg80211_disconnect_work; void cfg80211_set_dfs_state(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, enum nl80211_dfs_state dfs_state); void cfg80211_dfs_channels_update_work(struct work_struct *work); void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev); int cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef); void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev); void cfg80211_background_cac_done_wk(struct work_struct *work); void cfg80211_background_cac_abort_wk(struct work_struct *work); bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy, struct ieee80211_channel *chan); bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev); bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, struct ieee80211_channel *chan, bool primary_only); bool cfg80211_wdev_on_sub_chan(struct wireless_dev *wdev, struct ieee80211_channel *chan, bool primary_only); static inline unsigned int elapsed_jiffies_msecs(unsigned long start) { unsigned long end = jiffies; if (end >= start) return jiffies_to_msecs(end - start); return jiffies_to_msecs(end + (ULONG_MAX - start) + 1); } int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef); int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, const u8 *rates, unsigned int n_rates, u32 *mask); int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, u32 beacon_int); void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, int num); void cfg80211_leave(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); struct cfg80211_internal_bss * cfg80211_bss_update(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *tmp, bool signal_valid, unsigned long ts); #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS #define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond) #else /* * Trick to enable using it as a condition, * and also not give a warning when it's * not used that way. */ #define CFG80211_DEV_WARN_ON(cond) ({bool __r = (cond); __r; }) #endif void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid); void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev); void cfg80211_pmsr_free_wk(struct work_struct *work); void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id); void cfg80211_remove_links(struct wireless_dev *wdev); int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask); #endif /* __NET_WIRELESS_CORE_H */
95 95 1447 1447 310 368 1 310 310 504 503 310 310 365 365 365 364 82 58 1 365 365 1 365 58 47 47 25 25 1 1 1 10 10 1 1 1 1 53 265 265 264 264 3 57 57 57 57 57 55 55 55 55 55 55 55 186 51 51 127 23 64 53 64 53 53 53 53 4 51 51 49 25 25 25 26 26 26 1 25 25 25 25 504 285 256 227 95 95 1468 1446 994 758 95 2 53 9 9 9 82 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "hard-interface.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/gfp.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/kref.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <uapi/linux/batadv_packet.h> #include "bat_v.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "log.h" #include "originator.h" #include "send.h" #include "soft-interface.h" #include "translation-table.h" /** * batadv_hardif_release() - release hard interface from lists and queue for * free after rcu grace period * @ref: kref pointer of the hard interface */ void batadv_hardif_release(struct kref *ref) { struct batadv_hard_iface *hard_iface; hard_iface = container_of(ref, struct batadv_hard_iface, refcount); dev_put(hard_iface->net_dev); kfree_rcu(hard_iface, rcu); } /** * batadv_hardif_get_by_netdev() - Get hard interface object of a net_device * @net_dev: net_device to search for * * Return: batadv_hard_iface of net_dev (with increased refcnt), NULL on errors */ struct batadv_hard_iface * batadv_hardif_get_by_netdev(const struct net_device *net_dev) { struct batadv_hard_iface *hard_iface; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->net_dev == net_dev && kref_get_unless_zero(&hard_iface->refcount)) goto out; } hard_iface = NULL; out: rcu_read_unlock(); return hard_iface; } /** * batadv_getlink_net() - return link net namespace (of use fallback) * @netdev: net_device to check * @fallback_net: return in case get_link_net is not available for @netdev * * Return: result of rtnl_link_ops->get_link_net or @fallback_net */ static struct net *batadv_getlink_net(const struct net_device *netdev, struct net *fallback_net) { if (!netdev->rtnl_link_ops) return fallback_net; if (!netdev->rtnl_link_ops->get_link_net) return fallback_net; return netdev->rtnl_link_ops->get_link_net(netdev); } /** * batadv_mutual_parents() - check if two devices are each others parent * @dev1: 1st net dev * @net1: 1st devices netns * @dev2: 2nd net dev * @net2: 2nd devices netns * * veth devices come in pairs and each is the parent of the other! * * Return: true if the devices are each others parent, otherwise false */ static bool batadv_mutual_parents(const struct net_device *dev1, struct net *net1, const struct net_device *dev2, struct net *net2) { int dev1_parent_iflink = dev_get_iflink(dev1); int dev2_parent_iflink = dev_get_iflink(dev2); const struct net *dev1_parent_net; const struct net *dev2_parent_net; dev1_parent_net = batadv_getlink_net(dev1, net1); dev2_parent_net = batadv_getlink_net(dev2, net2); if (!dev1_parent_iflink || !dev2_parent_iflink) return false; return (dev1_parent_iflink == dev2->ifindex) && (dev2_parent_iflink == dev1->ifindex) && net_eq(dev1_parent_net, net2) && net_eq(dev2_parent_net, net1); } /** * batadv_is_on_batman_iface() - check if a device is a batman iface descendant * @net_dev: the device to check * * If the user creates any virtual device on top of a batman-adv interface, it * is important to prevent this new interface from being used to create a new * mesh network (this behaviour would lead to a batman-over-batman * configuration). This function recursively checks all the fathers of the * device passed as argument looking for a batman-adv soft interface. * * Return: true if the device is descendant of a batman-adv mesh interface (or * if it is a batman-adv interface itself), false otherwise */ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) { struct net *net = dev_net(net_dev); struct net_device *parent_dev; struct net *parent_net; int iflink; bool ret; /* check if this is a batman-adv mesh interface */ if (batadv_softif_is_valid(net_dev)) return true; iflink = dev_get_iflink(net_dev); if (iflink == 0) return false; parent_net = batadv_getlink_net(net_dev, net); /* iflink to itself, most likely physical device */ if (net == parent_net && iflink == net_dev->ifindex) return false; /* recurse over the parent device */ parent_dev = __dev_get_by_index((struct net *)parent_net, iflink); if (!parent_dev) { pr_warn("Cannot find parent device. Skipping batadv-on-batadv check for %s\n", net_dev->name); return false; } if (batadv_mutual_parents(net_dev, net, parent_dev, parent_net)) return false; ret = batadv_is_on_batman_iface(parent_dev); return ret; } static bool batadv_is_valid_iface(const struct net_device *net_dev) { if (net_dev->flags & IFF_LOOPBACK) return false; if (net_dev->type != ARPHRD_ETHER) return false; if (net_dev->addr_len != ETH_ALEN) return false; /* no batman over batman */ if (batadv_is_on_batman_iface(net_dev)) return false; return true; } /** * batadv_get_real_netdevice() - check if the given netdev struct is a virtual * interface on top of another 'real' interface * @netdev: the device to check * * Callers must hold the rtnl semaphore. You may want batadv_get_real_netdev() * instead of this. * * Return: the 'real' net device or the original net device and NULL in case * of an error. */ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) { struct batadv_hard_iface *hard_iface = NULL; struct net_device *real_netdev = NULL; struct net *real_net; struct net *net; int iflink; ASSERT_RTNL(); if (!netdev) return NULL; iflink = dev_get_iflink(netdev); if (iflink == 0) { dev_hold(netdev); return netdev; } hard_iface = batadv_hardif_get_by_netdev(netdev); if (!hard_iface || !hard_iface->soft_iface) goto out; net = dev_net(hard_iface->soft_iface); real_net = batadv_getlink_net(netdev, net); /* iflink to itself, most likely physical device */ if (net == real_net && netdev->ifindex == iflink) { real_netdev = netdev; dev_hold(real_netdev); goto out; } real_netdev = dev_get_by_index(real_net, iflink); out: batadv_hardif_put(hard_iface); return real_netdev; } /** * batadv_get_real_netdev() - check if the given net_device struct is a virtual * interface on top of another 'real' interface * @net_device: the device to check * * Return: the 'real' net device or the original net device and NULL in case * of an error. */ struct net_device *batadv_get_real_netdev(struct net_device *net_device) { struct net_device *real_netdev; rtnl_lock(); real_netdev = batadv_get_real_netdevice(net_device); rtnl_unlock(); return real_netdev; } /** * batadv_is_wext_netdev() - check if the given net_device struct is a * wext wifi interface * @net_device: the device to check * * Return: true if the net device is a wext wireless device, false * otherwise. */ static bool batadv_is_wext_netdev(struct net_device *net_device) { if (!net_device) return false; #ifdef CONFIG_WIRELESS_EXT /* pre-cfg80211 drivers have to implement WEXT, so it is possible to * check for wireless_handlers != NULL */ if (net_device->wireless_handlers) return true; #endif return false; } /** * batadv_is_cfg80211_netdev() - check if the given net_device struct is a * cfg80211 wifi interface * @net_device: the device to check * * Return: true if the net device is a cfg80211 wireless device, false * otherwise. */ static bool batadv_is_cfg80211_netdev(struct net_device *net_device) { if (!net_device) return false; #if IS_ENABLED(CONFIG_CFG80211) /* cfg80211 drivers have to set ieee80211_ptr */ if (net_device->ieee80211_ptr) return true; #endif return false; } /** * batadv_wifi_flags_evaluate() - calculate wifi flags for net_device * @net_device: the device to check * * Return: batadv_hard_iface_wifi_flags flags of the device */ static u32 batadv_wifi_flags_evaluate(struct net_device *net_device) { u32 wifi_flags = 0; struct net_device *real_netdev; if (batadv_is_wext_netdev(net_device)) wifi_flags |= BATADV_HARDIF_WIFI_WEXT_DIRECT; if (batadv_is_cfg80211_netdev(net_device)) wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT; real_netdev = batadv_get_real_netdevice(net_device); if (!real_netdev) return wifi_flags; if (real_netdev == net_device) goto out; if (batadv_is_wext_netdev(real_netdev)) wifi_flags |= BATADV_HARDIF_WIFI_WEXT_INDIRECT; if (batadv_is_cfg80211_netdev(real_netdev)) wifi_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT; out: dev_put(real_netdev); return wifi_flags; } /** * batadv_is_cfg80211_hardif() - check if the given hardif is a cfg80211 wifi * interface * @hard_iface: the device to check * * Return: true if the net device is a cfg80211 wireless device, false * otherwise. */ bool batadv_is_cfg80211_hardif(struct batadv_hard_iface *hard_iface) { u32 allowed_flags = 0; allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_DIRECT; allowed_flags |= BATADV_HARDIF_WIFI_CFG80211_INDIRECT; return !!(hard_iface->wifi_flags & allowed_flags); } /** * batadv_is_wifi_hardif() - check if the given hardif is a wifi interface * @hard_iface: the device to check * * Return: true if the net device is a 802.11 wireless device, false otherwise. */ bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface) { if (!hard_iface) return false; return hard_iface->wifi_flags != 0; } /** * batadv_hardif_no_broadcast() - check whether (re)broadcast is necessary * @if_outgoing: the outgoing interface checked and considered for (re)broadcast * @orig_addr: the originator of this packet * @orig_neigh: originator address of the forwarder we just got the packet from * (NULL if we originated) * * Checks whether a packet needs to be (re)broadcasted on the given interface. * * Return: * BATADV_HARDIF_BCAST_NORECIPIENT: No neighbor on interface * BATADV_HARDIF_BCAST_DUPFWD: Just one neighbor, but it is the forwarder * BATADV_HARDIF_BCAST_DUPORIG: Just one neighbor, but it is the originator * BATADV_HARDIF_BCAST_OK: Several neighbors, must broadcast */ int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing, u8 *orig_addr, u8 *orig_neigh) { struct batadv_hardif_neigh_node *hardif_neigh; struct hlist_node *first; int ret = BATADV_HARDIF_BCAST_OK; rcu_read_lock(); /* 0 neighbors -> no (re)broadcast */ first = rcu_dereference(hlist_first_rcu(&if_outgoing->neigh_list)); if (!first) { ret = BATADV_HARDIF_BCAST_NORECIPIENT; goto out; } /* >1 neighbors -> (re)broadcast */ if (rcu_dereference(hlist_next_rcu(first))) goto out; hardif_neigh = hlist_entry(first, struct batadv_hardif_neigh_node, list); /* 1 neighbor, is the originator -> no rebroadcast */ if (orig_addr && batadv_compare_eth(hardif_neigh->orig, orig_addr)) { ret = BATADV_HARDIF_BCAST_DUPORIG; /* 1 neighbor, is the one we received from -> no rebroadcast */ } else if (orig_neigh && batadv_compare_eth(hardif_neigh->orig, orig_neigh)) { ret = BATADV_HARDIF_BCAST_DUPFWD; } out: rcu_read_unlock(); return ret; } static struct batadv_hard_iface * batadv_hardif_get_active(const struct net_device *soft_iface) { struct batadv_hard_iface *hard_iface; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != soft_iface) continue; if (hard_iface->if_status == BATADV_IF_ACTIVE && kref_get_unless_zero(&hard_iface->refcount)) goto out; } hard_iface = NULL; out: rcu_read_unlock(); return hard_iface; } static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv, struct batadv_hard_iface *oldif) { struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; batadv_dat_init_own_addr(bat_priv, primary_if); batadv_bla_update_orig_address(bat_priv, primary_if, oldif); out: batadv_hardif_put(primary_if); } static void batadv_primary_if_select(struct batadv_priv *bat_priv, struct batadv_hard_iface *new_hard_iface) { struct batadv_hard_iface *curr_hard_iface; ASSERT_RTNL(); if (new_hard_iface) kref_get(&new_hard_iface->refcount); curr_hard_iface = rcu_replace_pointer(bat_priv->primary_if, new_hard_iface, 1); if (!new_hard_iface) goto out; bat_priv->algo_ops->iface.primary_set(new_hard_iface); batadv_primary_if_update_addr(bat_priv, curr_hard_iface); out: batadv_hardif_put(curr_hard_iface); } static bool batadv_hardif_is_iface_up(const struct batadv_hard_iface *hard_iface) { if (hard_iface->net_dev->flags & IFF_UP) return true; return false; } static void batadv_check_known_mac_addr(const struct net_device *net_dev) { const struct batadv_hard_iface *hard_iface; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE && hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; if (hard_iface->net_dev == net_dev) continue; if (!batadv_compare_eth(hard_iface->net_dev->dev_addr, net_dev->dev_addr)) continue; pr_warn("The newly added mac address (%pM) already exists on: %s\n", net_dev->dev_addr, hard_iface->net_dev->name); pr_warn("It is strongly recommended to keep mac addresses unique to avoid problems!\n"); } rcu_read_unlock(); } /** * batadv_hardif_recalc_extra_skbroom() - Recalculate skbuff extra head/tailroom * @soft_iface: netdev struct of the mesh interface */ static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface) { const struct batadv_hard_iface *hard_iface; unsigned short lower_header_len = ETH_HLEN; unsigned short lower_headroom = 0; unsigned short lower_tailroom = 0; unsigned short needed_headroom; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) continue; if (hard_iface->soft_iface != soft_iface) continue; lower_header_len = max_t(unsigned short, lower_header_len, hard_iface->net_dev->hard_header_len); lower_headroom = max_t(unsigned short, lower_headroom, hard_iface->net_dev->needed_headroom); lower_tailroom = max_t(unsigned short, lower_tailroom, hard_iface->net_dev->needed_tailroom); } rcu_read_unlock(); needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN); needed_headroom += batadv_max_header_len(); /* fragmentation headers don't strip the unicast/... header */ needed_headroom += sizeof(struct batadv_frag_packet); soft_iface->needed_headroom = needed_headroom; soft_iface->needed_tailroom = lower_tailroom; } /** * batadv_hardif_min_mtu() - Calculate maximum MTU for soft interface * @soft_iface: netdev struct of the soft interface * * Return: MTU for the soft-interface (limited by the minimal MTU of all active * slave interfaces) */ int batadv_hardif_min_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); const struct batadv_hard_iface *hard_iface; int min_mtu = INT_MAX; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE && hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; if (hard_iface->soft_iface != soft_iface) continue; min_mtu = min_t(int, hard_iface->net_dev->mtu, min_mtu); } rcu_read_unlock(); if (atomic_read(&bat_priv->fragmentation) == 0) goto out; /* with fragmentation enabled the maximum size of internally generated * packets such as translation table exchanges or tvlv containers, etc * has to be calculated */ min_mtu = min_t(int, min_mtu, BATADV_FRAG_MAX_FRAG_SIZE); min_mtu -= sizeof(struct batadv_frag_packet); min_mtu *= BATADV_FRAG_MAX_FRAGMENTS; out: /* report to the other components the maximum amount of bytes that * batman-adv can send over the wire (without considering the payload * overhead). For example, this value is used by TT to compute the * maximum local table size */ atomic_set(&bat_priv->packet_size_max, min_mtu); /* the real soft-interface MTU is computed by removing the payload * overhead from the maximum amount of bytes that was just computed. * * However batman-adv does not support MTUs bigger than ETH_DATA_LEN */ return min_t(int, min_mtu - batadv_max_header_len(), ETH_DATA_LEN); } /** * batadv_update_min_mtu() - Adjusts the MTU if a new interface with a smaller * MTU appeared * @soft_iface: netdev struct of the soft interface */ void batadv_update_min_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); int limit_mtu; int mtu; mtu = batadv_hardif_min_mtu(soft_iface); if (bat_priv->mtu_set_by_user) limit_mtu = bat_priv->mtu_set_by_user; else limit_mtu = ETH_DATA_LEN; mtu = min(mtu, limit_mtu); dev_set_mtu(soft_iface, mtu); /* Check if the local translate table should be cleaned up to match a * new (and smaller) MTU. */ batadv_tt_local_resize_to_mtu(soft_iface); } static void batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; if (hard_iface->if_status != BATADV_IF_INACTIVE) goto out; bat_priv = netdev_priv(hard_iface->soft_iface); bat_priv->algo_ops->iface.update_mac(hard_iface); hard_iface->if_status = BATADV_IF_TO_BE_ACTIVATED; /* the first active interface becomes our primary interface or * the next active interface after the old primary interface was removed */ primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) batadv_primary_if_select(bat_priv, hard_iface); batadv_info(hard_iface->soft_iface, "Interface activated: %s\n", hard_iface->net_dev->name); batadv_update_min_mtu(hard_iface->soft_iface); if (bat_priv->algo_ops->iface.activate) bat_priv->algo_ops->iface.activate(hard_iface); out: batadv_hardif_put(primary_if); } static void batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) { if (hard_iface->if_status != BATADV_IF_ACTIVE && hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) return; hard_iface->if_status = BATADV_IF_INACTIVE; batadv_info(hard_iface->soft_iface, "Interface deactivated: %s\n", hard_iface->net_dev->name); batadv_update_min_mtu(hard_iface->soft_iface); } /** * batadv_hardif_enable_interface() - Enslave hard interface to soft interface * @hard_iface: hard interface to add to soft interface * @soft_iface: netdev struct of the mesh interface * * Return: 0 on success or negative error number in case of failure */ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, struct net_device *soft_iface) { struct batadv_priv *bat_priv; __be16 ethertype = htons(ETH_P_BATMAN); int max_header_len = batadv_max_header_len(); unsigned int required_mtu; unsigned int hardif_mtu; int ret; hardif_mtu = READ_ONCE(hard_iface->net_dev->mtu); required_mtu = READ_ONCE(soft_iface->mtu) + max_header_len; if (hardif_mtu < ETH_MIN_MTU + max_header_len) return -EINVAL; if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) goto out; kref_get(&hard_iface->refcount); dev_hold(soft_iface); hard_iface->soft_iface = soft_iface; bat_priv = netdev_priv(hard_iface->soft_iface); ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface, NULL, NULL, NULL); if (ret) goto err_dev; ret = bat_priv->algo_ops->iface.enable(hard_iface); if (ret < 0) goto err_upper; hard_iface->if_status = BATADV_IF_INACTIVE; kref_get(&hard_iface->refcount); hard_iface->batman_adv_ptype.type = ethertype; hard_iface->batman_adv_ptype.func = batadv_batman_skb_recv; hard_iface->batman_adv_ptype.dev = hard_iface->net_dev; dev_add_pack(&hard_iface->batman_adv_ptype); batadv_info(hard_iface->soft_iface, "Adding interface: %s\n", hard_iface->net_dev->name); if (atomic_read(&bat_priv->fragmentation) && hardif_mtu < required_mtu) batadv_info(hard_iface->soft_iface, "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. Packets going over this interface will be fragmented on layer2 which could impact the performance. Setting the MTU to %i would solve the problem.\n", hard_iface->net_dev->name, hardif_mtu, required_mtu); if (!atomic_read(&bat_priv->fragmentation) && hardif_mtu < required_mtu) batadv_info(hard_iface->soft_iface, "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. If you experience problems getting traffic through try increasing the MTU to %i.\n", hard_iface->net_dev->name, hardif_mtu, required_mtu); if (batadv_hardif_is_iface_up(hard_iface)) batadv_hardif_activate_interface(hard_iface); else batadv_err(hard_iface->soft_iface, "Not using interface %s (retrying later): interface not active\n", hard_iface->net_dev->name); batadv_hardif_recalc_extra_skbroom(soft_iface); if (bat_priv->algo_ops->iface.enabled) bat_priv->algo_ops->iface.enabled(hard_iface); out: return 0; err_upper: netdev_upper_dev_unlink(hard_iface->net_dev, soft_iface); err_dev: hard_iface->soft_iface = NULL; dev_put(soft_iface); batadv_hardif_put(hard_iface); return ret; } /** * batadv_hardif_cnt() - get number of interfaces enslaved to soft interface * @soft_iface: soft interface to check * * This function is only using RCU for locking - the result can therefore be * off when another function is modifying the list at the same time. The * caller can use the rtnl_lock to make sure that the count is accurate. * * Return: number of connected/enslaved hard interfaces */ static size_t batadv_hardif_cnt(const struct net_device *soft_iface) { struct batadv_hard_iface *hard_iface; size_t count = 0; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != soft_iface) continue; count++; } rcu_read_unlock(); return count; } /** * batadv_hardif_disable_interface() - Remove hard interface from soft interface * @hard_iface: hard interface to be removed */ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if = NULL; batadv_hardif_deactivate_interface(hard_iface); if (hard_iface->if_status != BATADV_IF_INACTIVE) goto out; batadv_info(hard_iface->soft_iface, "Removing interface: %s\n", hard_iface->net_dev->name); dev_remove_pack(&hard_iface->batman_adv_ptype); batadv_hardif_put(hard_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (hard_iface == primary_if) { struct batadv_hard_iface *new_if; new_if = batadv_hardif_get_active(hard_iface->soft_iface); batadv_primary_if_select(bat_priv, new_if); batadv_hardif_put(new_if); } bat_priv->algo_ops->iface.disable(hard_iface); hard_iface->if_status = BATADV_IF_NOT_IN_USE; /* delete all references to this hard_iface */ batadv_purge_orig_ref(bat_priv); batadv_purge_outstanding_packets(bat_priv, hard_iface); dev_put(hard_iface->soft_iface); netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface); batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface); /* nobody uses this interface anymore */ if (batadv_hardif_cnt(hard_iface->soft_iface) <= 1) batadv_gw_check_client_stop(bat_priv); hard_iface->soft_iface = NULL; batadv_hardif_put(hard_iface); out: batadv_hardif_put(primary_if); } static struct batadv_hard_iface * batadv_hardif_add_interface(struct net_device *net_dev) { struct batadv_hard_iface *hard_iface; ASSERT_RTNL(); if (!batadv_is_valid_iface(net_dev)) goto out; dev_hold(net_dev); hard_iface = kzalloc(sizeof(*hard_iface), GFP_ATOMIC); if (!hard_iface) goto release_dev; hard_iface->net_dev = net_dev; hard_iface->soft_iface = NULL; hard_iface->if_status = BATADV_IF_NOT_IN_USE; INIT_LIST_HEAD(&hard_iface->list); INIT_HLIST_HEAD(&hard_iface->neigh_list); mutex_init(&hard_iface->bat_iv.ogm_buff_mutex); spin_lock_init(&hard_iface->neigh_list_lock); kref_init(&hard_iface->refcount); hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT; hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev); if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; atomic_set(&hard_iface->hop_penalty, 0); batadv_v_hardif_init(hard_iface); batadv_check_known_mac_addr(hard_iface->net_dev); kref_get(&hard_iface->refcount); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); batadv_hardif_generation++; return hard_iface; release_dev: dev_put(net_dev); out: return NULL; } static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface) { ASSERT_RTNL(); /* first deactivate interface */ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) batadv_hardif_disable_interface(hard_iface); if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) return; hard_iface->if_status = BATADV_IF_TO_BE_REMOVED; batadv_hardif_put(hard_iface); } /** * batadv_hard_if_event_softif() - Handle events for soft interfaces * @event: NETDEV_* event to handle * @net_dev: net_device which generated an event * * Return: NOTIFY_* result */ static int batadv_hard_if_event_softif(unsigned long event, struct net_device *net_dev) { struct batadv_priv *bat_priv; switch (event) { case NETDEV_REGISTER: bat_priv = netdev_priv(net_dev); batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS); break; } return NOTIFY_DONE; } static int batadv_hard_if_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *net_dev = netdev_notifier_info_to_dev(ptr); struct batadv_hard_iface *hard_iface; struct batadv_hard_iface *primary_if = NULL; struct batadv_priv *bat_priv; if (batadv_softif_is_valid(net_dev)) return batadv_hard_if_event_softif(event, net_dev); hard_iface = batadv_hardif_get_by_netdev(net_dev); if (!hard_iface && (event == NETDEV_REGISTER || event == NETDEV_POST_TYPE_CHANGE)) hard_iface = batadv_hardif_add_interface(net_dev); if (!hard_iface) goto out; switch (event) { case NETDEV_UP: batadv_hardif_activate_interface(hard_iface); break; case NETDEV_GOING_DOWN: case NETDEV_DOWN: batadv_hardif_deactivate_interface(hard_iface); break; case NETDEV_UNREGISTER: case NETDEV_PRE_TYPE_CHANGE: list_del_rcu(&hard_iface->list); batadv_hardif_generation++; batadv_hardif_remove_interface(hard_iface); break; case NETDEV_CHANGEMTU: if (hard_iface->soft_iface) batadv_update_min_mtu(hard_iface->soft_iface); break; case NETDEV_CHANGEADDR: if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) goto hardif_put; batadv_check_known_mac_addr(hard_iface->net_dev); bat_priv = netdev_priv(hard_iface->soft_iface); bat_priv->algo_ops->iface.update_mac(hard_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto hardif_put; if (hard_iface == primary_if) batadv_primary_if_update_addr(bat_priv, NULL); break; case NETDEV_CHANGEUPPER: hard_iface->wifi_flags = batadv_wifi_flags_evaluate(net_dev); if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; break; default: break; } hardif_put: batadv_hardif_put(hard_iface); out: batadv_hardif_put(primary_if); return NOTIFY_DONE; } struct notifier_block batadv_hard_if_notifier = { .notifier_call = batadv_hard_if_event, };
770 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_TSTAMP_H #define _NF_CONNTRACK_TSTAMP_H #include <net/net_namespace.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> struct nf_conn_tstamp { u_int64_t start; u_int64_t stop; }; static inline struct nf_conn_tstamp *nf_conn_tstamp_find(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP return nf_ct_ext_find(ct, NF_CT_EXT_TSTAMP); #else return NULL; #endif } static inline struct nf_conn_tstamp *nf_ct_tstamp_ext_add(struct nf_conn *ct, gfp_t gfp) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP struct net *net = nf_ct_net(ct); if (!net->ct.sysctl_tstamp) return NULL; return nf_ct_ext_add(ct, NF_CT_EXT_TSTAMP, gfp); #else return NULL; #endif }; #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP void nf_conntrack_tstamp_pernet_init(struct net *net); #else static inline void nf_conntrack_tstamp_pernet_init(struct net *net) {} #endif /* CONFIG_NF_CONNTRACK_TIMESTAMP */ #endif /* _NF_CONNTRACK_TSTAMP_H */
2667 2667 2546 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_SCSI_HOST_H #define _SCSI_SCSI_HOST_H #include <linux/device.h> #include <linux/list.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/blk-mq.h> #include <scsi/scsi.h> struct block_device; struct completion; struct module; struct scsi_cmnd; struct scsi_device; struct scsi_target; struct Scsi_Host; struct scsi_transport_template; #define SG_ALL SG_CHUNK_SIZE #define MODE_UNKNOWN 0x00 #define MODE_INITIATOR 0x01 #define MODE_TARGET 0x02 /** * enum scsi_timeout_action - How to handle a command that timed out. * @SCSI_EH_DONE: The command has already been completed. * @SCSI_EH_RESET_TIMER: Reset the timer and continue waiting for completion. * @SCSI_EH_NOT_HANDLED: The command has not yet finished. Abort the command. */ enum scsi_timeout_action { SCSI_EH_DONE, SCSI_EH_RESET_TIMER, SCSI_EH_NOT_HANDLED, }; struct scsi_host_template { /* * Put fields referenced in IO submission path together in * same cacheline */ /* * Additional per-command data allocated for the driver. */ unsigned int cmd_size; /* * The queuecommand function is used to queue up a scsi * command block to the LLDD. When the driver finished * processing the command the done callback is invoked. * * If queuecommand returns 0, then the driver has accepted the * command. It must also push it to the HBA if the scsi_cmnd * flag SCMD_LAST is set, or if the driver does not implement * commit_rqs. The done() function must be called on the command * when the driver has finished with it. (you may call done on the * command before queuecommand returns, but in this case you * *must* return 0 from queuecommand). * * Queuecommand may also reject the command, in which case it may * not touch the command and must not call done() for it. * * There are two possible rejection returns: * * SCSI_MLQUEUE_DEVICE_BUSY: Block this device temporarily, but * allow commands to other devices serviced by this host. * * SCSI_MLQUEUE_HOST_BUSY: Block all devices served by this * host temporarily. * * For compatibility, any other non-zero return is treated the * same as SCSI_MLQUEUE_HOST_BUSY. * * NOTE: "temporarily" means either until the next command for# * this device/host completes, or a period of time determined by * I/O pressure in the system if there are no other outstanding * commands. * * STATUS: REQUIRED */ int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *); /* * The commit_rqs function is used to trigger a hardware * doorbell after some requests have been queued with * queuecommand, when an error is encountered before sending * the request with SCMD_LAST set. * * STATUS: OPTIONAL */ void (*commit_rqs)(struct Scsi_Host *, u16); struct module *module; const char *name; /* * The info function will return whatever useful information the * developer sees fit. If not provided, then the name field will * be used instead. * * Status: OPTIONAL */ const char *(*info)(struct Scsi_Host *); /* * Ioctl interface * * Status: OPTIONAL */ int (*ioctl)(struct scsi_device *dev, unsigned int cmd, void __user *arg); #ifdef CONFIG_COMPAT /* * Compat handler. Handle 32bit ABI. * When unknown ioctl is passed return -ENOIOCTLCMD. * * Status: OPTIONAL */ int (*compat_ioctl)(struct scsi_device *dev, unsigned int cmd, void __user *arg); #endif int (*init_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd); int (*exit_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd); /* * This is an error handling strategy routine. You don't need to * define one of these if you don't want to - there is a default * routine that is present that should work in most cases. For those * driver authors that have the inclination and ability to write their * own strategy routine, this is where it is specified. Note - the * strategy routine is *ALWAYS* run in the context of the kernel eh * thread. Thus you are guaranteed to *NOT* be in an interrupt * handler when you execute this, and you are also guaranteed to * *NOT* have any other commands being queued while you are in the * strategy routine. When you return from this function, operations * return to normal. * * See scsi_error.c scsi_unjam_host for additional comments about * what this function should and should not be attempting to do. * * Status: REQUIRED (at least one of them) */ int (* eh_abort_handler)(struct scsi_cmnd *); int (* eh_device_reset_handler)(struct scsi_cmnd *); int (* eh_target_reset_handler)(struct scsi_cmnd *); int (* eh_bus_reset_handler)(struct scsi_cmnd *); int (* eh_host_reset_handler)(struct scsi_cmnd *); /* * Before the mid layer attempts to scan for a new device where none * currently exists, it will call this entry in your driver. Should * your driver need to allocate any structs or perform any other init * items in order to send commands to a currently unused target/lun * combo, then this is where you can perform those allocations. This * is specifically so that drivers won't have to perform any kind of * "is this a new device" checks in their queuecommand routine, * thereby making the hot path a bit quicker. * * Return values: 0 on success, non-0 on failure * * Deallocation: If we didn't find any devices at this ID, you will * get an immediate call to slave_destroy(). If we find something * here then you will get a call to slave_configure(), then the * device will be used for however long it is kept around, then when * the device is removed from the system (or * possibly at reboot * time), you will then get a call to slave_destroy(). This is * assuming you implement slave_configure and slave_destroy. * However, if you allocate memory and hang it off the device struct, * then you must implement the slave_destroy() routine at a minimum * in order to avoid leaking memory * each time a device is tore down. * * Status: OPTIONAL */ int (* slave_alloc)(struct scsi_device *); /* * Once the device has responded to an INQUIRY and we know the * device is online, we call into the low level driver with the * struct scsi_device *. If the low level device driver implements * this function, it *must* perform the task of setting the queue * depth on the device. All other tasks are optional and depend * on what the driver supports and various implementation details. * * Things currently recommended to be handled at this time include: * * 1. Setting the device queue depth. Proper setting of this is * described in the comments for scsi_change_queue_depth. * 2. Determining if the device supports the various synchronous * negotiation protocols. The device struct will already have * responded to INQUIRY and the results of the standard items * will have been shoved into the various device flag bits, eg. * device->sdtr will be true if the device supports SDTR messages. * 3. Allocating command structs that the device will need. * 4. Setting the default timeout on this device (if needed). * 5. Anything else the low level driver might want to do on a device * specific setup basis... * 6. Return 0 on success, non-0 on error. The device will be marked * as offline on error so that no access will occur. If you return * non-0, your slave_destroy routine will never get called for this * device, so don't leave any loose memory hanging around, clean * up after yourself before returning non-0 * * Status: OPTIONAL */ int (* slave_configure)(struct scsi_device *); /* * Immediately prior to deallocating the device and after all activity * has ceased the mid layer calls this point so that the low level * driver may completely detach itself from the scsi device and vice * versa. The low level driver is responsible for freeing any memory * it allocated in the slave_alloc or slave_configure calls. * * Status: OPTIONAL */ void (* slave_destroy)(struct scsi_device *); /* * Before the mid layer attempts to scan for a new device attached * to a target where no target currently exists, it will call this * entry in your driver. Should your driver need to allocate any * structs or perform any other init items in order to send commands * to a currently unused target, then this is where you can perform * those allocations. * * Return values: 0 on success, non-0 on failure * * Status: OPTIONAL */ int (* target_alloc)(struct scsi_target *); /* * Immediately prior to deallocating the target structure, and * after all activity to attached scsi devices has ceased, the * midlayer calls this point so that the driver may deallocate * and terminate any references to the target. * * Note: This callback is called with the host lock held and hence * must not sleep. * * Status: OPTIONAL */ void (* target_destroy)(struct scsi_target *); /* * If a host has the ability to discover targets on its own instead * of scanning the entire bus, it can fill in this function and * call scsi_scan_host(). This function will be called periodically * until it returns 1 with the scsi_host and the elapsed time of * the scan in jiffies. * * Status: OPTIONAL */ int (* scan_finished)(struct Scsi_Host *, unsigned long); /* * If the host wants to be called before the scan starts, but * after the midlayer has set up ready for the scan, it can fill * in this function. * * Status: OPTIONAL */ void (* scan_start)(struct Scsi_Host *); /* * Fill in this function to allow the queue depth of this host * to be changeable (on a per device basis). Returns either * the current queue depth setting (may be different from what * was passed in) or an error. An error should only be * returned if the requested depth is legal but the driver was * unable to set it. If the requested depth is illegal, the * driver should set and return the closest legal queue depth. * * Status: OPTIONAL */ int (* change_queue_depth)(struct scsi_device *, int); /* * This functions lets the driver expose the queue mapping * to the block layer. * * Status: OPTIONAL */ void (* map_queues)(struct Scsi_Host *shost); /* * SCSI interface of blk_poll - poll for IO completions. * Only applicable if SCSI LLD exposes multiple h/w queues. * * Return value: Number of completed entries found. * * Status: OPTIONAL */ int (* mq_poll)(struct Scsi_Host *shost, unsigned int queue_num); /* * Check if scatterlists need to be padded for DMA draining. * * Status: OPTIONAL */ bool (* dma_need_drain)(struct request *rq); /* * This function determines the BIOS parameters for a given * harddisk. These tend to be numbers that are made up by * the host adapter. Parameters: * size, device, list (heads, sectors, cylinders) * * Status: OPTIONAL */ int (* bios_param)(struct scsi_device *, struct block_device *, sector_t, int []); /* * This function is called when one or more partitions on the * device reach beyond the end of the device. * * Status: OPTIONAL */ void (*unlock_native_capacity)(struct scsi_device *); /* * Can be used to export driver statistics and other infos to the * world outside the kernel ie. userspace and it also provides an * interface to feed the driver with information. * * Status: OBSOLETE */ int (*show_info)(struct seq_file *, struct Scsi_Host *); int (*write_info)(struct Scsi_Host *, char *, int); /* * This is an optional routine that allows the transport to become * involved when a scsi io timer fires. The return value tells the * timer routine how to finish the io timeout handling. * * Status: OPTIONAL */ enum scsi_timeout_action (*eh_timed_out)(struct scsi_cmnd *); /* * Optional routine that allows the transport to decide if a cmd * is retryable. Return true if the transport is in a state the * cmd should be retried on. */ bool (*eh_should_retry_cmd)(struct scsi_cmnd *scmd); /* This is an optional routine that allows transport to initiate * LLD adapter or firmware reset using sysfs attribute. * * Return values: 0 on success, -ve value on failure. * * Status: OPTIONAL */ int (*host_reset)(struct Scsi_Host *shost, int reset_type); #define SCSI_ADAPTER_RESET 1 #define SCSI_FIRMWARE_RESET 2 /* * Name of proc directory */ const char *proc_name; /* * This determines if we will use a non-interrupt driven * or an interrupt driven scheme. It is set to the maximum number * of simultaneous commands a single hw queue in HBA will accept. */ int can_queue; /* * In many instances, especially where disconnect / reconnect are * supported, our host also has an ID on the SCSI bus. If this is * the case, then it must be reserved. Please set this_id to -1 if * your setup is in single initiator mode, and the host lacks an * ID. */ int this_id; /* * This determines the degree to which the host adapter is capable * of scatter-gather. */ unsigned short sg_tablesize; unsigned short sg_prot_tablesize; /* * Set this if the host adapter has limitations beside segment count. */ unsigned int max_sectors; /* * Maximum size in bytes of a single segment. */ unsigned int max_segment_size; /* * DMA scatter gather segment boundary limit. A segment crossing this * boundary will be split in two. */ unsigned long dma_boundary; unsigned long virt_boundary_mask; /* * This specifies "machine infinity" for host templates which don't * limit the transfer size. Note this limit represents an absolute * maximum, and may be over the transfer limits allowed for * individual devices (e.g. 256 for SCSI-1). */ #define SCSI_DEFAULT_MAX_SECTORS 1024 /* * True if this host adapter can make good use of linked commands. * This will allow more than one command to be queued to a given * unit on a given host. Set this to the maximum number of command * blocks to be provided for each device. Set this to 1 for one * command block per lun, 2 for two, etc. Do not set this to 0. * You should make sure that the host adapter will do the right thing * before you try setting this above 1. */ short cmd_per_lun; /* If use block layer to manage tags, this is tag allocation policy */ int tag_alloc_policy; /* * Track QUEUE_FULL events and reduce queue depth on demand. */ unsigned track_queue_depth:1; /* * This specifies the mode that a LLD supports. */ unsigned supported_mode:2; /* * True for emulated SCSI host adapters (e.g. ATAPI). */ unsigned emulated:1; /* * True if the low-level driver performs its own reset-settle delays. */ unsigned skip_settle_delay:1; /* True if the controller does not support WRITE SAME */ unsigned no_write_same:1; /* True if the host uses host-wide tagspace */ unsigned host_tagset:1; /* The queuecommand callback may block. See also BLK_MQ_F_BLOCKING. */ unsigned queuecommand_may_block:1; /* * Countdown for host blocking with no commands outstanding. */ unsigned int max_host_blocked; /* * Default value for the blocking. If the queue is empty, * host_blocked counts down in the request_fn until it restarts * host operations as zero is reached. * * FIXME: This should probably be a value in the template */ #define SCSI_DEFAULT_HOST_BLOCKED 7 /* * Pointer to the SCSI host sysfs attribute groups, NULL terminated. */ const struct attribute_group **shost_groups; /* * Pointer to the SCSI device attribute groups for this host, * NULL terminated. */ const struct attribute_group **sdev_groups; /* * Vendor Identifier associated with the host * * Note: When specifying vendor_id, be sure to read the * Vendor Type and ID formatting requirements specified in * scsi_netlink.h */ u64 vendor_id; /* Delay for runtime autosuspend */ int rpm_autosuspend_delay; }; /* * Temporary #define for host lock push down. Can be removed when all * drivers have been updated to take advantage of unlocked * queuecommand. * */ #define DEF_SCSI_QCMD(func_name) \ int func_name(struct Scsi_Host *shost, struct scsi_cmnd *cmd) \ { \ unsigned long irq_flags; \ int rc; \ spin_lock_irqsave(shost->host_lock, irq_flags); \ rc = func_name##_lck(cmd); \ spin_unlock_irqrestore(shost->host_lock, irq_flags); \ return rc; \ } /* * shost state: If you alter this, you also need to alter scsi_sysfs.c * (for the ascii descriptions) and the state model enforcer: * scsi_host_set_state() */ enum scsi_host_state { SHOST_CREATED = 1, SHOST_RUNNING, SHOST_CANCEL, SHOST_DEL, SHOST_RECOVERY, SHOST_CANCEL_RECOVERY, SHOST_DEL_RECOVERY, }; struct Scsi_Host { /* * __devices is protected by the host_lock, but you should * usually use scsi_device_lookup / shost_for_each_device * to access it and don't care about locking yourself. * In the rare case of being in irq context you can use * their __ prefixed variants with the lock held. NEVER * access this list directly from a driver. */ struct list_head __devices; struct list_head __targets; struct list_head starved_list; spinlock_t default_lock; spinlock_t *host_lock; struct mutex scan_mutex;/* serialize scanning activity */ struct list_head eh_abort_list; struct list_head eh_cmd_q; struct task_struct * ehandler; /* Error recovery thread. */ struct completion * eh_action; /* Wait for specific actions on the host. */ wait_queue_head_t host_wait; const struct scsi_host_template *hostt; struct scsi_transport_template *transportt; struct kref tagset_refcnt; struct completion tagset_freed; /* Area to keep a shared tag map */ struct blk_mq_tag_set tag_set; atomic_t host_blocked; unsigned int host_failed; /* commands that failed. protected by host_lock */ unsigned int host_eh_scheduled; /* EH scheduled without command */ unsigned int host_no; /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. */ /* next two fields are used to bound the time spent in error handling */ int eh_deadline; unsigned long last_reset; /* * These three parameters can be used to allow for wide scsi, * and for host adapters that support multiple busses * The last two should be set to 1 more than the actual max id * or lun (e.g. 8 for SCSI parallel systems). */ unsigned int max_channel; unsigned int max_id; u64 max_lun; /* * This is a unique identifier that must be assigned so that we * have some way of identifying each detected host adapter properly * and uniquely. For hosts that do not support more than one card * in the system at one time, this does not need to be set. It is * initialized to 0 in scsi_register. */ unsigned int unique_id; /* * The maximum length of SCSI commands that this host can accept. * Probably 12 for most host adapters, but could be 16 for others. * or 260 if the driver supports variable length cdbs. * For drivers that don't set this field, a value of 12 is * assumed. */ unsigned short max_cmd_len; int this_id; int can_queue; short cmd_per_lun; short unsigned int sg_tablesize; short unsigned int sg_prot_tablesize; unsigned int max_sectors; unsigned int opt_sectors; unsigned int max_segment_size; unsigned long dma_boundary; unsigned long virt_boundary_mask; /* * In scsi-mq mode, the number of hardware queues supported by the LLD. * * Note: it is assumed that each hardware queue has a queue depth of * can_queue. In other words, the total queue depth per host * is nr_hw_queues * can_queue. However, for when host_tagset is set, * the total queue depth is can_queue. */ unsigned nr_hw_queues; unsigned nr_maps; unsigned active_mode:2; /* * Host has requested that no further requests come through for the * time being. */ unsigned host_self_blocked:1; /* * Host uses correct SCSI ordering not PC ordering. The bit is * set for the minority of drivers whose authors actually read * the spec ;). */ unsigned reverse_ordering:1; /* Task mgmt function in progress */ unsigned tmf_in_progress:1; /* Asynchronous scan in progress */ unsigned async_scan:1; /* Don't resume host in EH */ unsigned eh_noresume:1; /* The controller does not support WRITE SAME */ unsigned no_write_same:1; /* True if the host uses host-wide tagspace */ unsigned host_tagset:1; /* The queuecommand callback may block. See also BLK_MQ_F_BLOCKING. */ unsigned queuecommand_may_block:1; /* Host responded with short (<36 bytes) INQUIRY result */ unsigned short_inquiry:1; /* The transport requires the LUN bits NOT to be stored in CDB[1] */ unsigned no_scsi2_lun_in_cdb:1; /* * Optional work queue to be utilized by the transport */ char work_q_name[20]; struct workqueue_struct *work_q; /* * Task management function work queue */ struct workqueue_struct *tmf_work_q; /* * Value host_blocked counts down from */ unsigned int max_host_blocked; /* Protection Information */ unsigned int prot_capabilities; unsigned char prot_guard_type; /* legacy crap */ unsigned long base; unsigned long io_port; unsigned char n_io_port; unsigned char dma_channel; unsigned int irq; enum scsi_host_state shost_state; /* ldm bits */ struct device shost_gendev, shost_dev; /* * Points to the transport data (if any) which is allocated * separately */ void *shost_data; /* * Points to the physical bus device we'd use to do DMA * Needed just in case we have virtual hosts. */ struct device *dma_dev; /* * We should ensure that this is aligned, both for better performance * and also because some compilers (m68k) don't automatically force * alignment to a long boundary. */ unsigned long hostdata[] /* Used for storage of host specific stuff */ __attribute__ ((aligned (sizeof(unsigned long)))); }; #define class_to_shost(d) \ container_of(d, struct Scsi_Host, shost_dev) #define shost_printk(prefix, shost, fmt, a...) \ dev_printk(prefix, &(shost)->shost_gendev, fmt, ##a) static inline void *shost_priv(struct Scsi_Host *shost) { return (void *)shost->hostdata; } int scsi_is_host_device(const struct device *); static inline struct Scsi_Host *dev_to_shost(struct device *dev) { while (!scsi_is_host_device(dev)) { if (!dev->parent) return NULL; dev = dev->parent; } return container_of(dev, struct Scsi_Host, shost_gendev); } static inline int scsi_host_in_recovery(struct Scsi_Host *shost) { return shost->shost_state == SHOST_RECOVERY || shost->shost_state == SHOST_CANCEL_RECOVERY || shost->shost_state == SHOST_DEL_RECOVERY || shost->tmf_in_progress; } extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *); extern void scsi_flush_work(struct Scsi_Host *); extern struct Scsi_Host *scsi_host_alloc(const struct scsi_host_template *, int); extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *, struct device *, struct device *); #if defined(CONFIG_SCSI_PROC_FS) struct proc_dir_entry * scsi_template_proc_dir(const struct scsi_host_template *sht); #else #define scsi_template_proc_dir(sht) NULL #endif extern void scsi_scan_host(struct Scsi_Host *); extern int scsi_rescan_device(struct scsi_device *sdev); extern void scsi_remove_host(struct Scsi_Host *); extern struct Scsi_Host *scsi_host_get(struct Scsi_Host *); extern int scsi_host_busy(struct Scsi_Host *shost); extern void scsi_host_put(struct Scsi_Host *t); extern struct Scsi_Host *scsi_host_lookup(unsigned int hostnum); extern const char *scsi_host_state_name(enum scsi_host_state); extern void scsi_host_complete_all_commands(struct Scsi_Host *shost, enum scsi_host_status status); static inline int __must_check scsi_add_host(struct Scsi_Host *host, struct device *dev) { return scsi_add_host_with_dma(host, dev, dev); } static inline struct device *scsi_get_device(struct Scsi_Host *shost) { return shost->shost_gendev.parent; } /** * scsi_host_scan_allowed - Is scanning of this host allowed * @shost: Pointer to Scsi_Host. **/ static inline int scsi_host_scan_allowed(struct Scsi_Host *shost) { return shost->shost_state == SHOST_RUNNING || shost->shost_state == SHOST_RECOVERY; } extern void scsi_unblock_requests(struct Scsi_Host *); extern void scsi_block_requests(struct Scsi_Host *); extern int scsi_host_block(struct Scsi_Host *shost); extern int scsi_host_unblock(struct Scsi_Host *shost, int new_state); void scsi_host_busy_iter(struct Scsi_Host *, bool (*fn)(struct scsi_cmnd *, void *), void *priv); struct class_container; /* * DIF defines the exchange of protection information between * initiator and SBC block device. * * DIX defines the exchange of protection information between OS and * initiator. */ enum scsi_host_prot_capabilities { SHOST_DIF_TYPE1_PROTECTION = 1 << 0, /* T10 DIF Type 1 */ SHOST_DIF_TYPE2_PROTECTION = 1 << 1, /* T10 DIF Type 2 */ SHOST_DIF_TYPE3_PROTECTION = 1 << 2, /* T10 DIF Type 3 */ SHOST_DIX_TYPE0_PROTECTION = 1 << 3, /* DIX between OS and HBA only */ SHOST_DIX_TYPE1_PROTECTION = 1 << 4, /* DIX with DIF Type 1 */ SHOST_DIX_TYPE2_PROTECTION = 1 << 5, /* DIX with DIF Type 2 */ SHOST_DIX_TYPE3_PROTECTION = 1 << 6, /* DIX with DIF Type 3 */ }; /* * SCSI hosts which support the Data Integrity Extensions must * indicate their capabilities by setting the prot_capabilities using * this call. */ static inline void scsi_host_set_prot(struct Scsi_Host *shost, unsigned int mask) { shost->prot_capabilities = mask; } static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost) { return shost->prot_capabilities; } static inline int scsi_host_prot_dma(struct Scsi_Host *shost) { return shost->prot_capabilities >= SHOST_DIX_TYPE0_PROTECTION; } static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type) { static unsigned char cap[] = { 0, SHOST_DIF_TYPE1_PROTECTION, SHOST_DIF_TYPE2_PROTECTION, SHOST_DIF_TYPE3_PROTECTION }; if (target_type >= ARRAY_SIZE(cap)) return 0; return shost->prot_capabilities & cap[target_type] ? target_type : 0; } static inline unsigned int scsi_host_dix_capable(struct Scsi_Host *shost, unsigned int target_type) { #if defined(CONFIG_BLK_DEV_INTEGRITY) static unsigned char cap[] = { SHOST_DIX_TYPE0_PROTECTION, SHOST_DIX_TYPE1_PROTECTION, SHOST_DIX_TYPE2_PROTECTION, SHOST_DIX_TYPE3_PROTECTION }; if (target_type >= ARRAY_SIZE(cap)) return 0; return shost->prot_capabilities & cap[target_type]; #endif return 0; } /* * All DIX-capable initiators must support the T10-mandated CRC * checksum. Controllers can optionally implement the IP checksum * scheme which has much lower impact on system performance. Note * that the main rationale for the checksum is to match integrity * metadata with data. Detecting bit errors are a job for ECC memory * and buses. */ enum scsi_host_guard_type { SHOST_DIX_GUARD_CRC = 1 << 0, SHOST_DIX_GUARD_IP = 1 << 1, }; static inline void scsi_host_set_guard(struct Scsi_Host *shost, unsigned char type) { shost->prot_guard_type = type; } static inline unsigned char scsi_host_get_guard(struct Scsi_Host *shost) { return shost->prot_guard_type; } extern int scsi_host_set_state(struct Scsi_Host *, enum scsi_host_state); #endif /* _SCSI_SCSI_HOST_H */
3 3 3 3 3 2 1 3 1 2 3 1 1 1 1 1 1 9 9 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 3 3 3 3 3 1 9 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 // SPDX-License-Identifier: GPL-2.0-or-later /* Manage a process's keyrings * * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/keyctl.h> #include <linux/fs.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/security.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> #include <linux/init_task.h> #include <keys/request_key_auth-type.h> #include "internal.h" /* Session keyring create vs join semaphore */ static DEFINE_MUTEX(key_session_mutex); /* The root user's tracking struct */ struct key_user root_key_user = { .usage = REFCOUNT_INIT(3), .cons_lock = __MUTEX_INITIALIZER(root_key_user.cons_lock), .lock = __SPIN_LOCK_UNLOCKED(root_key_user.lock), .nkeys = ATOMIC_INIT(2), .nikeys = ATOMIC_INIT(2), .uid = GLOBAL_ROOT_UID, }; /* * Get or create a user register keyring. */ static struct key *get_user_register(struct user_namespace *user_ns) { struct key *reg_keyring = READ_ONCE(user_ns->user_keyring_register); if (reg_keyring) return reg_keyring; down_write(&user_ns->keyring_sem); /* Make sure there's a register keyring. It gets owned by the * user_namespace's owner. */ reg_keyring = user_ns->user_keyring_register; if (!reg_keyring) { reg_keyring = keyring_alloc(".user_reg", user_ns->owner, INVALID_GID, &init_cred, KEY_POS_WRITE | KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ, 0, NULL, NULL); if (!IS_ERR(reg_keyring)) smp_store_release(&user_ns->user_keyring_register, reg_keyring); } up_write(&user_ns->keyring_sem); /* We don't return a ref since the keyring is pinned by the user_ns */ return reg_keyring; } /* * Look up the user and user session keyrings for the current process's UID, * creating them if they don't exist. */ int look_up_user_keyrings(struct key **_user_keyring, struct key **_user_session_keyring) { const struct cred *cred = current_cred(); struct user_namespace *user_ns = current_user_ns(); struct key *reg_keyring, *uid_keyring, *session_keyring; key_perm_t user_keyring_perm; key_ref_t uid_keyring_r, session_keyring_r; uid_t uid = from_kuid(user_ns, cred->user->uid); char buf[20]; int ret; user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; kenter("%u", uid); reg_keyring = get_user_register(user_ns); if (IS_ERR(reg_keyring)) return PTR_ERR(reg_keyring); down_write(&user_ns->keyring_sem); ret = 0; /* Get the user keyring. Note that there may be one in existence * already as it may have been pinned by a session, but the user_struct * pointing to it may have been destroyed by setuid. */ snprintf(buf, sizeof(buf), "_uid.%u", uid); uid_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid %p", uid_keyring_r); if (uid_keyring_r == ERR_PTR(-EAGAIN)) { uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, reg_keyring); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; } } else if (IS_ERR(uid_keyring_r)) { ret = PTR_ERR(uid_keyring_r); goto error; } else { uid_keyring = key_ref_to_ptr(uid_keyring_r); } /* Get a default session keyring (which might also exist already) */ snprintf(buf, sizeof(buf), "_uid_ses.%u", uid); session_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid_ses %p", session_keyring_r); if (session_keyring_r == ERR_PTR(-EAGAIN)) { session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; } /* We install a link from the user session keyring to * the user keyring. */ ret = key_link(session_keyring, uid_keyring); if (ret < 0) goto error_release_session; /* And only then link the user-session keyring to the * register. */ ret = key_link(reg_keyring, session_keyring); if (ret < 0) goto error_release_session; } else if (IS_ERR(session_keyring_r)) { ret = PTR_ERR(session_keyring_r); goto error_release; } else { session_keyring = key_ref_to_ptr(session_keyring_r); } up_write(&user_ns->keyring_sem); if (_user_session_keyring) *_user_session_keyring = session_keyring; else key_put(session_keyring); if (_user_keyring) *_user_keyring = uid_keyring; else key_put(uid_keyring); kleave(" = 0"); return 0; error_release_session: key_put(session_keyring); error_release: key_put(uid_keyring); error: up_write(&user_ns->keyring_sem); kleave(" = %d", ret); return ret; } /* * Get the user session keyring if it exists, but don't create it if it * doesn't. */ struct key *get_user_session_keyring_rcu(const struct cred *cred) { struct key *reg_keyring = READ_ONCE(cred->user_ns->user_keyring_register); key_ref_t session_keyring_r; char buf[20]; struct keyring_search_context ctx = { .index_key.type = &key_type_keyring, .index_key.description = buf, .cred = cred, .match_data.cmp = key_default_cmp, .match_data.raw_data = buf, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; if (!reg_keyring) return NULL; ctx.index_key.desc_len = snprintf(buf, sizeof(buf), "_uid_ses.%u", from_kuid(cred->user_ns, cred->user->uid)); session_keyring_r = keyring_search_rcu(make_key_ref(reg_keyring, true), &ctx); if (IS_ERR(session_keyring_r)) return NULL; return key_ref_to_ptr(session_keyring_r); } /* * Install a thread keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a thread keyring is now present; -errno on failure. */ int install_thread_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->thread_keyring) return 0; keyring = keyring_alloc("_tid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->thread_keyring = keyring; return 0; } /* * Install a thread keyring to the current task if it didn't have one already. * * Return: 0 if a thread keyring is now present; -errno on failure. */ static int install_thread_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_thread_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install a process keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a process keyring is now present; -errno on failure. */ int install_process_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->process_keyring) return 0; keyring = keyring_alloc("_pid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->process_keyring = keyring; return 0; } /* * Install a process keyring to the current task if it didn't have one already. * * Return: 0 if a process keyring is now present; -errno on failure. */ static int install_process_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_process_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install the given keyring as the session keyring of the given credentials * struct, replacing the existing one if any. If the given keyring is NULL, * then install a new anonymous session keyring. * @cred can not be in use by any task yet. * * Return: 0 on success; -errno on failure. */ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) { unsigned long flags; struct key *old; might_sleep(); /* create an empty session keyring */ if (!keyring) { flags = KEY_ALLOC_QUOTA_OVERRUN; if (cred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, flags, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { __key_get(keyring); } /* install the keyring */ old = cred->session_keyring; cred->session_keyring = keyring; if (old) key_put(old); return 0; } /* * Install the given keyring as the session keyring of the current task, * replacing the existing one if any. If the given keyring is NULL, then * install a new anonymous session keyring. * * Return: 0 on success; -errno on failure. */ static int install_session_keyring(struct key *keyring) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Handle the fsuid changing. */ void key_fsuid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->uid = new_cred->fsuid; up_write(&new_cred->thread_keyring->sem); } } /* * Handle the fsgid changing. */ void key_fsgid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->gid = new_cred->fsgid; up_write(&new_cred->thread_keyring->sem); } } /* * Search the process keyrings attached to the supplied cred for the first * matching key under RCU conditions (the caller must be holding the RCU read * lock). * * The search criteria are the type and the match function. The description is * given to the match function as a parameter, but doesn't otherwise influence * the search. Typically the match function will compare the description * parameter to the key's description. * * This can only search keyrings that grant Search permission to the supplied * credentials. Keyrings linked to searched keyrings will also be searched if * they grant Search permission too. Keys can only be found if they grant * Search permission to the credentials. * * Returns a pointer to the key with the key usage count incremented if * successful, -EAGAIN if we didn't find any matching key or -ENOKEY if we only * matched negative keys. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx) { struct key *user_session; key_ref_t key_ref, ret, err; const struct cred *cred = ctx->cred; /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if * none of the keyrings were searchable * * in terms of priority: success > -ENOKEY > -EAGAIN > other error */ key_ref = NULL; ret = NULL; err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ if (cred->thread_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->thread_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the process keyring second */ if (cred->process_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->process_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the session keyring */ if (cred->session_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->session_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* or search the user-session keyring */ else if ((user_session = get_user_session_keyring_rcu(cred))) { key_ref = keyring_search_rcu(make_key_ref(user_session, 1), ctx); key_put(user_session); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* no key - decide on the error we're going to go for */ key_ref = ret ? ret : err; found: return key_ref; } /* * Search the process keyrings attached to the supplied cred for the first * matching key in the manner of search_my_process_keyrings(), but also search * the keys attached to the assumed authorisation key using its credentials if * one is available. * * The caller must be holding the RCU read lock. * * Return same as search_cred_keyrings_rcu(). */ key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx) { struct request_key_auth *rka; key_ref_t key_ref, ret = ERR_PTR(-EACCES), err; key_ref = search_cred_keyrings_rcu(ctx); if (!IS_ERR(key_ref)) goto found; err = key_ref; /* if this process has an instantiation authorisation key, then we also * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ if (ctx->cred->request_key_auth && ctx->cred == current_cred() && ctx->index_key.type != &key_type_request_key_auth ) { const struct cred *cred = ctx->cred; if (key_validate(cred->request_key_auth) == 0) { rka = ctx->cred->request_key_auth->payload.data[0]; //// was search_process_keyrings() [ie. recursive] ctx->cred = rka->cred; key_ref = search_cred_keyrings_rcu(ctx); ctx->cred = cred; if (!IS_ERR(key_ref)) goto found; ret = key_ref; } } /* no key - decide on the error we're going to go for */ if (err == ERR_PTR(-ENOKEY) || ret == ERR_PTR(-ENOKEY)) key_ref = ERR_PTR(-ENOKEY); else if (err == ERR_PTR(-EACCES)) key_ref = ret; else key_ref = err; found: return key_ref; } /* * See if the key we're looking at is the target key. */ bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data) { return key == match_data->raw_data; } /* * Look up a key ID given us by userspace with a given permissions mask to get * the key it refers to. * * Flags can be passed to request that special keyrings be created if referred * to directly, to permit partially constructed keys to be found and to skip * validity and permission checks on the found key. * * Returns a pointer to the key with an incremented usage count if successful; * -EINVAL if the key ID is invalid; -ENOKEY if the key ID does not correspond * to a key or the best found key was a negative key; -EKEYREVOKED or * -EKEYEXPIRED if the best found key was revoked or expired; -EACCES if the * found key doesn't grant the requested permit or the LSM denied access to it; * or -ENOMEM if a special keyring couldn't be created. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, enum key_need_perm need_perm) { struct keyring_search_context ctx = { .match_data.cmp = lookup_user_key_possessed, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_RECURSE), }; struct request_key_auth *rka; struct key *key, *user_session; key_ref_t key_ref, skey_ref; int ret; try_again: ctx.cred = get_current_cred(); key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: if (!ctx.cred->thread_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_thread_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->thread_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_PROCESS_KEYRING: if (!ctx.cred->process_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_process_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->process_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: if (!ctx.cred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = look_up_user_keyrings(NULL, &user_session); if (ret < 0) goto error; if (lflags & KEY_LOOKUP_CREATE) ret = join_session_keyring(NULL); else ret = install_session_keyring(user_session); key_put(user_session); if (ret < 0) goto error; goto reget_creds; } else if (test_bit(KEY_FLAG_UID_KEYRING, &ctx.cred->session_keyring->flags) && lflags & KEY_LOOKUP_CREATE) { ret = join_session_keyring(NULL); if (ret < 0) goto error; goto reget_creds; } key = ctx.cred->session_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_KEYRING: ret = look_up_user_keyrings(&key, NULL); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_SESSION_KEYRING: ret = look_up_user_keyrings(NULL, &key); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_GROUP_KEYRING: /* group keyrings are not yet supported */ key_ref = ERR_PTR(-EINVAL); goto error; case KEY_SPEC_REQKEY_AUTH_KEY: key = ctx.cred->request_key_auth; if (!key) goto error; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_REQUESTOR_KEYRING: if (!ctx.cred->request_key_auth) goto error; down_read(&ctx.cred->request_key_auth->sem); if (test_bit(KEY_FLAG_REVOKED, &ctx.cred->request_key_auth->flags)) { key_ref = ERR_PTR(-EKEYREVOKED); key = NULL; } else { rka = ctx.cred->request_key_auth->payload.data[0]; key = rka->dest_keyring; __key_get(key); } up_read(&ctx.cred->request_key_auth->sem); if (!key) goto error; key_ref = make_key_ref(key, 1); break; default: key_ref = ERR_PTR(-EINVAL); if (id < 1) goto error; key = key_lookup(id); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error; } key_ref = make_key_ref(key, 0); /* check to see if we possess the key */ ctx.index_key = key->index_key; ctx.match_data.raw_data = key; kdebug("check possessed"); rcu_read_lock(); skey_ref = search_process_keyrings_rcu(&ctx); rcu_read_unlock(); kdebug("possessed=%p", skey_ref); if (!IS_ERR(skey_ref)) { key_put(key); key_ref = skey_ref; } break; } /* unlink does not use the nominated key in any way, so can skip all * the permission checks as it is only concerned with the keyring */ if (need_perm != KEY_NEED_UNLINK) { if (!(lflags & KEY_LOOKUP_PARTIAL)) { ret = wait_for_key_construction(key, true); switch (ret) { case -ERESTARTSYS: goto invalid_key; default: if (need_perm != KEY_AUTHTOKEN_OVERRIDE && need_perm != KEY_DEFER_PERM_CHECK) goto invalid_key; break; case 0: break; } } else if (need_perm != KEY_DEFER_PERM_CHECK) { ret = key_validate(key); if (ret < 0) goto invalid_key; } ret = -EIO; if (!(lflags & KEY_LOOKUP_PARTIAL) && key_read_state(key) == KEY_IS_UNINSTANTIATED) goto invalid_key; } /* check the permissions */ ret = key_task_permission(key_ref, ctx.cred, need_perm); if (ret < 0) goto invalid_key; key->last_used_at = ktime_get_real_seconds(); error: put_cred(ctx.cred); return key_ref; invalid_key: key_ref_put(key_ref); key_ref = ERR_PTR(ret); goto error; /* if we attempted to install a keyring, then it may have caused new * creds to be installed */ reget_creds: put_cred(ctx.cred); goto try_again; } EXPORT_SYMBOL(lookup_user_key); /* * Join the named keyring as the session keyring if possible else attempt to * create a new one of that name and join that. * * If the name is NULL, an empty anonymous keyring will be installed as the * session keyring. * * Named session keyrings are joined with a semaphore held to prevent the * keyrings from going away whilst the attempt is made to going them and also * to prevent a race in creating compatible session keyrings. */ long join_session_keyring(const char *name) { const struct cred *old; struct cred *new; struct key *keyring; long ret, serial; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); /* if no name is provided, install an anonymous keyring */ if (!name) { ret = install_session_keyring_to_cred(new, NULL); if (ret < 0) goto error; serial = new->session_keyring->serial; ret = commit_creds(new); if (ret == 0) ret = serial; goto okay; } /* allow the user to join or create a named keyring */ mutex_lock(&key_session_mutex); /* look for an existing keyring of this name */ keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ keyring = keyring_alloc( name, old->uid, old->gid, old, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } else if (keyring == new->session_keyring) { ret = 0; goto error3; } /* we've got a keyring - now to install it */ ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) goto error3; commit_creds(new); mutex_unlock(&key_session_mutex); ret = keyring->serial; key_put(keyring); okay: return ret; error3: key_put(keyring); error2: mutex_unlock(&key_session_mutex); error: abort_creds(new); return ret; } /* * Replace a process's session keyring on behalf of one of its children when * the target process is about to resume userspace execution. */ void key_change_session_keyring(struct callback_head *twork) { const struct cred *old = current_cred(); struct cred *new = container_of(twork, struct cred, rcu); if (unlikely(current->flags & PF_EXITING)) { put_cred(new); return; } /* If get_ucounts fails more bits are needed in the refcount */ if (unlikely(!get_ucounts(old->ucounts))) { WARN_ONCE(1, "In %s get_ucounts failed\n", __func__); put_cred(new); return; } new-> uid = old-> uid; new-> euid = old-> euid; new-> suid = old-> suid; new->fsuid = old->fsuid; new-> gid = old-> gid; new-> egid = old-> egid; new-> sgid = old-> sgid; new->fsgid = old->fsgid; new->user = get_uid(old->user); new->ucounts = old->ucounts; new->user_ns = get_user_ns(old->user_ns); new->group_info = get_group_info(old->group_info); new->securebits = old->securebits; new->cap_inheritable = old->cap_inheritable; new->cap_permitted = old->cap_permitted; new->cap_effective = old->cap_effective; new->cap_ambient = old->cap_ambient; new->cap_bset = old->cap_bset; new->jit_keyring = old->jit_keyring; new->thread_keyring = key_get(old->thread_keyring); new->process_keyring = key_get(old->process_keyring); security_transfer_creds(new, old); commit_creds(new); } /* * Make sure that root's user and user-session keyrings exist. */ static int __init init_root_keyring(void) { return look_up_user_keyrings(NULL, NULL); } late_initcall(init_root_keyring);
1 1 1 1 2 2 2 2 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 // SPDX-License-Identifier: GPL-2.0 /* * cfg80211 wext compat for managed mode. * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2009, 2020-2023 Intel Corporation */ #include <linux/export.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <net/cfg80211.h> #include <net/cfg80211-wext.h> #include "wext-compat.h" #include "nl80211.h" int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct cfg80211_cached_keys *ck = NULL; const u8 *prev_bssid = NULL; int err, i; ASSERT_RTNL(); lockdep_assert_wiphy(wdev->wiphy); if (!netif_running(wdev->netdev)) return 0; wdev->wext.connect.ie = wdev->wext.ie; wdev->wext.connect.ie_len = wdev->wext.ie_len; /* Use default background scan period */ wdev->wext.connect.bg_scan_period = -1; if (wdev->wext.keys) { wdev->wext.keys->def = wdev->wext.default_key; if (wdev->wext.default_key != -1) wdev->wext.connect.privacy = true; } if (!wdev->wext.connect.ssid_len) return 0; if (wdev->wext.keys && wdev->wext.keys->def != -1) { ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; for (i = 0; i < 4; i++) ck->params[i].key = ck->data[i]; } if (wdev->wext.prev_bssid_valid) prev_bssid = wdev->wext.prev_bssid; err = cfg80211_connect(rdev, wdev->netdev, &wdev->wext.connect, ck, prev_bssid); if (err) kfree_sensitive(ck); return err; } int cfg80211_mgd_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *wextfreq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_channel *chan = NULL; int err, freq; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; freq = cfg80211_wext_freq(wextfreq); if (freq < 0) return freq; if (freq) { chan = ieee80211_get_channel(wdev->wiphy, freq); if (!chan) return -EINVAL; if (chan->flags & IEEE80211_CHAN_DISABLED) return -EINVAL; } if (wdev->conn) { bool event = true; if (wdev->wext.connect.channel == chan) return 0; /* if SSID set, we'll try right again, avoid event */ if (wdev->wext.connect.ssid_len) event = false; err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, event); if (err) return err; } wdev->wext.connect.channel = chan; return cfg80211_mgd_wext_connect(rdev, wdev); } int cfg80211_mgd_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_channel *chan = NULL; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (wdev->valid_links) return -EOPNOTSUPP; if (wdev->links[0].client.current_bss) chan = wdev->links[0].client.current_bss->pub.channel; else if (wdev->wext.connect.channel) chan = wdev->wext.connect.channel; if (chan) { freq->m = chan->center_freq; freq->e = 6; return 0; } /* no channel if not joining */ return -EINVAL; } int cfg80211_mgd_wext_siwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); size_t len = data->length; int err; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (!data->flags) len = 0; /* iwconfig uses nul termination in SSID.. */ if (len > 0 && ssid[len - 1] == '\0') len--; if (wdev->conn) { bool event = true; if (wdev->wext.connect.ssid && len && len == wdev->wext.connect.ssid_len && memcmp(wdev->wext.connect.ssid, ssid, len) == 0) return 0; /* if SSID set now, we'll try to connect, avoid event */ if (len) event = false; err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, event); if (err) return err; } wdev->wext.prev_bssid_valid = false; wdev->wext.connect.ssid = wdev->wext.ssid; memcpy(wdev->wext.ssid, ssid, len); wdev->wext.connect.ssid_len = len; wdev->wext.connect.crypto.control_port = false; wdev->wext.connect.crypto.control_port_ethertype = cpu_to_be16(ETH_P_PAE); return cfg80211_mgd_wext_connect(rdev, wdev); } int cfg80211_mgd_wext_giwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; int ret = 0; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (wdev->valid_links) return -EINVAL; data->flags = 0; if (wdev->links[0].client.current_bss) { const struct element *ssid_elem; rcu_read_lock(); ssid_elem = ieee80211_bss_get_elem( &wdev->links[0].client.current_bss->pub, WLAN_EID_SSID); if (ssid_elem) { data->flags = 1; data->length = ssid_elem->datalen; if (data->length > IW_ESSID_MAX_SIZE) ret = -EINVAL; else memcpy(ssid, ssid_elem->data, data->length); } rcu_read_unlock(); } else if (wdev->wext.connect.ssid && wdev->wext.connect.ssid_len) { data->flags = 1; data->length = wdev->wext.connect.ssid_len; memcpy(ssid, wdev->wext.connect.ssid, data->length); } return ret; } int cfg80211_mgd_wext_siwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *bssid = ap_addr->sa_data; int err; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (ap_addr->sa_family != ARPHRD_ETHER) return -EINVAL; /* automatic mode */ if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid)) bssid = NULL; if (wdev->conn) { /* both automatic */ if (!bssid && !wdev->wext.connect.bssid) return 0; /* fixed already - and no change */ if (wdev->wext.connect.bssid && bssid && ether_addr_equal(bssid, wdev->wext.connect.bssid)) return 0; err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, false); if (err) return err; } if (bssid) { memcpy(wdev->wext.bssid, bssid, ETH_ALEN); wdev->wext.connect.bssid = wdev->wext.bssid; } else wdev->wext.connect.bssid = NULL; return cfg80211_mgd_wext_connect(rdev, wdev); } int cfg80211_mgd_wext_giwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; ap_addr->sa_family = ARPHRD_ETHER; if (wdev->valid_links) return -EOPNOTSUPP; if (wdev->links[0].client.current_bss) memcpy(ap_addr->sa_data, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN); else eth_zero_addr(ap_addr->sa_data); return 0; } int cfg80211_wext_siwgenie(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *ie = extra; int ie_len = data->length, err; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!ie_len) ie = NULL; wiphy_lock(wdev->wiphy); /* no change */ err = 0; if (wdev->wext.ie_len == ie_len && memcmp(wdev->wext.ie, ie, ie_len) == 0) goto out; if (ie_len) { ie = kmemdup(extra, ie_len, GFP_KERNEL); if (!ie) { err = -ENOMEM; goto out; } } else ie = NULL; kfree(wdev->wext.ie); wdev->wext.ie = ie; wdev->wext.ie_len = ie_len; if (wdev->conn) { err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, false); if (err) goto out; } /* userspace better not think we'll reconnect */ err = 0; out: wiphy_unlock(wdev->wiphy); return err; } int cfg80211_wext_siwmlme(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_mlme *mlme = (struct iw_mlme *)extra; struct cfg80211_registered_device *rdev; int err; if (!wdev) return -EOPNOTSUPP; rdev = wiphy_to_rdev(wdev->wiphy); if (wdev->iftype != NL80211_IFTYPE_STATION) return -EINVAL; if (mlme->addr.sa_family != ARPHRD_ETHER) return -EINVAL; wiphy_lock(&rdev->wiphy); switch (mlme->cmd) { case IW_MLME_DEAUTH: case IW_MLME_DISASSOC: err = cfg80211_disconnect(rdev, dev, mlme->reason_code, true); break; default: err = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return err; }
8926 3088 1177 374 776 815 1206 1204 374 666 245 497 8 4 4 8 4 4 1389 1389 1385 1317 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> linux/include/linux/rbtree.h To use rbtrees you'll have to implement your own insert and search cores. This will avoid us to use callbacks and to drop drammatically performances. I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... See Documentation/core-api/rbtree.rst for documentation and samples. */ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H #include <linux/container_of.h> #include <linux/rbtree_types.h> #include <linux/stddef.h> #include <linux/rcupdate.h> #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; rcu_assign_pointer(*rb_link, node); } #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) /** * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. * @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. * * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as * list_for_each_entry_safe() and allows the iteration to continue independent * of changes to @pos by the body of the loop. * * Note, however, that it cannot handle other modifications that re-order the * rbtree it is iterating over. This includes calling rb_erase() on @pos, as * rb_erase() may rebalance the tree, causing us to miss some nodes. */ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ typeof(*pos), field); 1; }); \ pos = n) /* Same as rb_first(), but O(1) */ #define rb_first_cached(root) (root)->rb_leftmost static inline void rb_insert_color_cached(struct rb_node *node, struct rb_root_cached *root, bool leftmost) { if (leftmost) root->rb_leftmost = node; rb_insert_color(node, &root->rb_root); } static inline struct rb_node * rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { struct rb_node *leftmost = NULL; if (root->rb_leftmost == node) leftmost = root->rb_leftmost = rb_next(node); rb_erase(node, &root->rb_root); return leftmost; } static inline void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, struct rb_root_cached *root) { if (root->rb_leftmost == victim) root->rb_leftmost = new; rb_replace_node(victim, new, &root->rb_root); } /* * The below helper functions use 2 operators with 3 different * calling conventions. The operators are related like: * * comp(a->key,b) < 0 := less(a,b) * comp(a->key,b) > 0 := less(b,a) * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) * * If these operators define a partial order on the elements we make no * guarantee on which of the elements matching the key is found. See * rb_find(). * * The reason for this is to allow the find() interface without requiring an * on-stack dummy object, which might not be feasible due to object size. */ /** * rb_add_cached() - insert @node into the leftmost cached tree @tree * @node: node to insert * @tree: leftmost cached tree to insert @node into * @less: operator defining the (partial) node order * * Returns @node when it is the new leftmost, or NULL. */ static __always_inline struct rb_node * rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return leftmost ? node : NULL; } /** * rb_add() - insert @node into @tree * @node: node to insert * @tree: tree to insert @node into * @less: operator defining the (partial) node order */ static __always_inline void rb_add(struct rb_node *node, struct rb_root *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; while (*link) { parent = *link; if (less(node, parent)) link = &parent->rb_left; else link = &parent->rb_right; } rb_link_node(node, parent, link); rb_insert_color(node, tree); } /** * rb_find_add() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = node->rb_left; else if (c > 0) node = node->rb_right; else return node; } return NULL; } /** * rb_find_first() - find the first @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the leftmost node matching @key, or NULL. */ static __always_inline struct rb_node * rb_find_first(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; struct rb_node *match = NULL; while (node) { int c = cmp(key, node); if (c <= 0) { if (!c) match = node; node = node->rb_left; } else if (c > 0) { node = node->rb_right; } } return match; } /** * rb_next_match() - find the next @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the next node matching @key, or NULL. */ static __always_inline struct rb_node * rb_next_match(const void *key, struct rb_node *node, int (*cmp)(const void *key, const struct rb_node *)) { node = rb_next(node); if (node && cmp(key, node)) node = NULL; return node; } /** * rb_for_each() - iterates a subtree matching @key * @node: iterator * @key: key to match * @tree: tree to search * @cmp: operator defining node order */ #define rb_for_each(node, key, tree, cmp) \ for ((node) = rb_find_first((key), (tree), (cmp)); \ (node); (node) = rb_next_match((key), (node), (cmp))) #endif /* _LINUX_RBTREE_H */
1860 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 /* SPDX-License-Identifier: GPL-2.0 */ /* * This is <linux/capability.h> * * Andrew G. Morgan <morgan@kernel.org> * Alexander Kjeldaas <astor@guardian.no> * with help from Aleph1, Roland Buresund and Andrew Main. * * See here for the libcap library ("POSIX draft" compliance): * * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/ */ #ifndef _LINUX_CAPABILITY_H #define _LINUX_CAPABILITY_H #include <uapi/linux/capability.h> #include <linux/uidgid.h> #include <linux/bits.h> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 extern int file_caps_enabled; typedef struct { u64 val; } kernel_cap_t; /* same as vfs_ns_cap_data but in cpu endian and always filled completely */ struct cpu_vfs_cap_data { __u32 magic_etc; kuid_t rootid; kernel_cap_t permitted; kernel_cap_t inheritable; }; #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) struct file; struct inode; struct dentry; struct task_struct; struct user_namespace; struct mnt_idmap; /* * CAP_FS_MASK and CAP_NFSD_MASKS: * * The fs mask is all the privileges that fsuid==0 historically meant. * At one time in the past, that included CAP_MKNOD and CAP_LINUX_IMMUTABLE. * * It has never meant setting security.* and trusted.* xattrs. * * We could also define fsmask as follows: * 1. CAP_FS_MASK is the privilege to bypass all fs-related DAC permissions * 2. The security.* and trusted.* xattrs are fs-related MAC permissions */ # define CAP_FS_MASK (BIT_ULL(CAP_CHOWN) \ | BIT_ULL(CAP_MKNOD) \ | BIT_ULL(CAP_DAC_OVERRIDE) \ | BIT_ULL(CAP_DAC_READ_SEARCH) \ | BIT_ULL(CAP_FOWNER) \ | BIT_ULL(CAP_FSETID) \ | BIT_ULL(CAP_MAC_OVERRIDE)) #define CAP_VALID_MASK (BIT_ULL(CAP_LAST_CAP+1)-1) # define CAP_EMPTY_SET ((kernel_cap_t) { 0 }) # define CAP_FULL_SET ((kernel_cap_t) { CAP_VALID_MASK }) # define CAP_FS_SET ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_LINUX_IMMUTABLE) }) # define CAP_NFSD_SET ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_SYS_RESOURCE) }) # define cap_clear(c) do { (c).val = 0; } while (0) #define cap_raise(c, flag) ((c).val |= BIT_ULL(flag)) #define cap_lower(c, flag) ((c).val &= ~BIT_ULL(flag)) #define cap_raised(c, flag) (((c).val & BIT_ULL(flag)) != 0) static inline kernel_cap_t cap_combine(const kernel_cap_t a, const kernel_cap_t b) { return (kernel_cap_t) { a.val | b.val }; } static inline kernel_cap_t cap_intersect(const kernel_cap_t a, const kernel_cap_t b) { return (kernel_cap_t) { a.val & b.val }; } static inline kernel_cap_t cap_drop(const kernel_cap_t a, const kernel_cap_t drop) { return (kernel_cap_t) { a.val &~ drop.val }; } static inline bool cap_isclear(const kernel_cap_t a) { return !a.val; } static inline bool cap_isidentical(const kernel_cap_t a, const kernel_cap_t b) { return a.val == b.val; } /* * Check if "a" is a subset of "set". * return true if ALL of the capabilities in "a" are also in "set" * cap_issubset(0101, 1111) will return true * return false if ANY of the capabilities in "a" are not in "set" * cap_issubset(1111, 0101) will return false */ static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set) { return !(a.val & ~set.val); } /* Used to decide between falling back on the old suser() or fsuser(). */ static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a) { return cap_drop(a, CAP_FS_SET); } static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a, const kernel_cap_t permitted) { return cap_combine(a, cap_intersect(permitted, CAP_FS_SET)); } static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a) { return cap_drop(a, CAP_NFSD_SET); } static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, const kernel_cap_t permitted) { return cap_combine(a, cap_intersect(permitted, CAP_NFSD_SET)); } #ifdef CONFIG_MULTIUSER extern bool has_capability(struct task_struct *t, int cap); extern bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap); extern bool has_capability_noaudit(struct task_struct *t, int cap); extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); extern bool ns_capable_setid(struct user_namespace *ns, int cap); #else static inline bool has_capability(struct task_struct *t, int cap) { return true; } static inline bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool has_capability_noaudit(struct task_struct *t, int cap) { return true; } static inline bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool capable(int cap) { return true; } static inline bool ns_capable(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_setid(struct user_namespace *ns, int cap) { return true; } #endif /* CONFIG_MULTIUSER */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct mnt_idmap *idmap, const struct inode *inode); bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); static inline bool perfmon_capable(void) { return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); } static inline bool bpf_capable(void) { return capable(CAP_BPF) || capable(CAP_SYS_ADMIN); } static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) { return ns_capable(ns, CAP_CHECKPOINT_RESTORE) || ns_capable(ns, CAP_SYS_ADMIN); } /* audit system wants to get cap info from files as well */ int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry, const void **ivalue, size_t size); #endif /* !_LINUX_CAPABILITY_H */
2658 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM scsi #if !defined(_TRACE_SCSI_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCSI_H #include <scsi/scsi_cmnd.h> #include <scsi/scsi_host.h> #include <linux/tracepoint.h> #include <linux/trace_seq.h> #define scsi_opcode_name(opcode) { opcode, #opcode } #define show_opcode_name(val) \ __print_symbolic(val, \ scsi_opcode_name(TEST_UNIT_READY), \ scsi_opcode_name(REZERO_UNIT), \ scsi_opcode_name(REQUEST_SENSE), \ scsi_opcode_name(FORMAT_UNIT), \ scsi_opcode_name(READ_BLOCK_LIMITS), \ scsi_opcode_name(REASSIGN_BLOCKS), \ scsi_opcode_name(INITIALIZE_ELEMENT_STATUS), \ scsi_opcode_name(READ_6), \ scsi_opcode_name(WRITE_6), \ scsi_opcode_name(SEEK_6), \ scsi_opcode_name(READ_REVERSE), \ scsi_opcode_name(WRITE_FILEMARKS), \ scsi_opcode_name(SPACE), \ scsi_opcode_name(INQUIRY), \ scsi_opcode_name(RECOVER_BUFFERED_DATA), \ scsi_opcode_name(MODE_SELECT), \ scsi_opcode_name(RESERVE), \ scsi_opcode_name(RELEASE), \ scsi_opcode_name(COPY), \ scsi_opcode_name(ERASE), \ scsi_opcode_name(MODE_SENSE), \ scsi_opcode_name(START_STOP), \ scsi_opcode_name(RECEIVE_DIAGNOSTIC), \ scsi_opcode_name(SEND_DIAGNOSTIC), \ scsi_opcode_name(ALLOW_MEDIUM_REMOVAL), \ scsi_opcode_name(SET_WINDOW), \ scsi_opcode_name(READ_CAPACITY), \ scsi_opcode_name(READ_10), \ scsi_opcode_name(WRITE_10), \ scsi_opcode_name(SEEK_10), \ scsi_opcode_name(POSITION_TO_ELEMENT), \ scsi_opcode_name(WRITE_VERIFY), \ scsi_opcode_name(VERIFY), \ scsi_opcode_name(SEARCH_HIGH), \ scsi_opcode_name(SEARCH_EQUAL), \ scsi_opcode_name(SEARCH_LOW), \ scsi_opcode_name(SET_LIMITS), \ scsi_opcode_name(PRE_FETCH), \ scsi_opcode_name(READ_POSITION), \ scsi_opcode_name(SYNCHRONIZE_CACHE), \ scsi_opcode_name(LOCK_UNLOCK_CACHE), \ scsi_opcode_name(READ_DEFECT_DATA), \ scsi_opcode_name(MEDIUM_SCAN), \ scsi_opcode_name(COMPARE), \ scsi_opcode_name(COPY_VERIFY), \ scsi_opcode_name(WRITE_BUFFER), \ scsi_opcode_name(READ_BUFFER), \ scsi_opcode_name(UPDATE_BLOCK), \ scsi_opcode_name(READ_LONG), \ scsi_opcode_name(WRITE_LONG), \ scsi_opcode_name(CHANGE_DEFINITION), \ scsi_opcode_name(WRITE_SAME), \ scsi_opcode_name(UNMAP), \ scsi_opcode_name(READ_TOC), \ scsi_opcode_name(LOG_SELECT), \ scsi_opcode_name(LOG_SENSE), \ scsi_opcode_name(XDWRITEREAD_10), \ scsi_opcode_name(MODE_SELECT_10), \ scsi_opcode_name(RESERVE_10), \ scsi_opcode_name(RELEASE_10), \ scsi_opcode_name(MODE_SENSE_10), \ scsi_opcode_name(PERSISTENT_RESERVE_IN), \ scsi_opcode_name(PERSISTENT_RESERVE_OUT), \ scsi_opcode_name(VARIABLE_LENGTH_CMD), \ scsi_opcode_name(REPORT_LUNS), \ scsi_opcode_name(MAINTENANCE_IN), \ scsi_opcode_name(MAINTENANCE_OUT), \ scsi_opcode_name(MOVE_MEDIUM), \ scsi_opcode_name(EXCHANGE_MEDIUM), \ scsi_opcode_name(READ_12), \ scsi_opcode_name(WRITE_12), \ scsi_opcode_name(WRITE_VERIFY_12), \ scsi_opcode_name(SEARCH_HIGH_12), \ scsi_opcode_name(SEARCH_EQUAL_12), \ scsi_opcode_name(SEARCH_LOW_12), \ scsi_opcode_name(READ_ELEMENT_STATUS), \ scsi_opcode_name(SEND_VOLUME_TAG), \ scsi_opcode_name(WRITE_LONG_2), \ scsi_opcode_name(READ_16), \ scsi_opcode_name(WRITE_16), \ scsi_opcode_name(VERIFY_16), \ scsi_opcode_name(WRITE_SAME_16), \ scsi_opcode_name(ZBC_OUT), \ scsi_opcode_name(ZBC_IN), \ scsi_opcode_name(SERVICE_ACTION_IN_16), \ scsi_opcode_name(READ_32), \ scsi_opcode_name(WRITE_32), \ scsi_opcode_name(WRITE_SAME_32), \ scsi_opcode_name(ATA_16), \ scsi_opcode_name(ATA_12)) #define scsi_hostbyte_name(result) { result, #result } #define show_hostbyte_name(val) \ __print_symbolic(val, \ scsi_hostbyte_name(DID_OK), \ scsi_hostbyte_name(DID_NO_CONNECT), \ scsi_hostbyte_name(DID_BUS_BUSY), \ scsi_hostbyte_name(DID_TIME_OUT), \ scsi_hostbyte_name(DID_BAD_TARGET), \ scsi_hostbyte_name(DID_ABORT), \ scsi_hostbyte_name(DID_PARITY), \ scsi_hostbyte_name(DID_ERROR), \ scsi_hostbyte_name(DID_RESET), \ scsi_hostbyte_name(DID_BAD_INTR), \ scsi_hostbyte_name(DID_PASSTHROUGH), \ scsi_hostbyte_name(DID_SOFT_ERROR), \ scsi_hostbyte_name(DID_IMM_RETRY), \ scsi_hostbyte_name(DID_REQUEUE), \ scsi_hostbyte_name(DID_TRANSPORT_DISRUPTED), \ scsi_hostbyte_name(DID_TRANSPORT_FAILFAST)) #define scsi_statusbyte_name(result) { result, #result } #define show_statusbyte_name(val) \ __print_symbolic(val, \ scsi_statusbyte_name(SAM_STAT_GOOD), \ scsi_statusbyte_name(SAM_STAT_CHECK_CONDITION), \ scsi_statusbyte_name(SAM_STAT_CONDITION_MET), \ scsi_statusbyte_name(SAM_STAT_BUSY), \ scsi_statusbyte_name(SAM_STAT_INTERMEDIATE), \ scsi_statusbyte_name(SAM_STAT_INTERMEDIATE_CONDITION_MET), \ scsi_statusbyte_name(SAM_STAT_RESERVATION_CONFLICT), \ scsi_statusbyte_name(SAM_STAT_COMMAND_TERMINATED), \ scsi_statusbyte_name(SAM_STAT_TASK_SET_FULL), \ scsi_statusbyte_name(SAM_STAT_ACA_ACTIVE), \ scsi_statusbyte_name(SAM_STAT_TASK_ABORTED)) #define scsi_prot_op_name(result) { result, #result } #define show_prot_op_name(val) \ __print_symbolic(val, \ scsi_prot_op_name(SCSI_PROT_NORMAL), \ scsi_prot_op_name(SCSI_PROT_READ_INSERT), \ scsi_prot_op_name(SCSI_PROT_WRITE_STRIP), \ scsi_prot_op_name(SCSI_PROT_READ_STRIP), \ scsi_prot_op_name(SCSI_PROT_WRITE_INSERT), \ scsi_prot_op_name(SCSI_PROT_READ_PASS), \ scsi_prot_op_name(SCSI_PROT_WRITE_PASS)) const char *scsi_trace_parse_cdb(struct trace_seq*, unsigned char*, int); #define __parse_cdb(cdb, len) scsi_trace_parse_cdb(p, cdb, len) TRACE_EVENT(scsi_dispatch_cmd_start, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd), TP_STRUCT__entry( __field( unsigned int, host_no ) __field( unsigned int, channel ) __field( unsigned int, id ) __field( unsigned int, lun ) __field( unsigned int, opcode ) __field( unsigned int, cmd_len ) __field( int, driver_tag) __field( int, scheduler_tag) __field( unsigned int, data_sglen ) __field( unsigned int, prot_sglen ) __field( unsigned char, prot_op ) __dynamic_array(unsigned char, cmnd, cmd->cmd_len) ), TP_fast_assign( __entry->host_no = cmd->device->host->host_no; __entry->channel = cmd->device->channel; __entry->id = cmd->device->id; __entry->lun = cmd->device->lun; __entry->opcode = cmd->cmnd[0]; __entry->cmd_len = cmd->cmd_len; __entry->driver_tag = scsi_cmd_to_rq(cmd)->tag; __entry->scheduler_tag = scsi_cmd_to_rq(cmd)->internal_tag; __entry->data_sglen = scsi_sg_count(cmd); __entry->prot_sglen = scsi_prot_sg_count(cmd); __entry->prot_op = scsi_get_prot_op(cmd); memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); ), TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \ " prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s)", __entry->host_no, __entry->channel, __entry->id, __entry->lun, __entry->data_sglen, __entry->prot_sglen, show_prot_op_name(__entry->prot_op), __entry->driver_tag, __entry->scheduler_tag, show_opcode_name(__entry->opcode), __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len)) ); TRACE_EVENT(scsi_dispatch_cmd_error, TP_PROTO(struct scsi_cmnd *cmd, int rtn), TP_ARGS(cmd, rtn), TP_STRUCT__entry( __field( unsigned int, host_no ) __field( unsigned int, channel ) __field( unsigned int, id ) __field( unsigned int, lun ) __field( int, rtn ) __field( unsigned int, opcode ) __field( unsigned int, cmd_len ) __field( int, driver_tag) __field( int, scheduler_tag) __field( unsigned int, data_sglen ) __field( unsigned int, prot_sglen ) __field( unsigned char, prot_op ) __dynamic_array(unsigned char, cmnd, cmd->cmd_len) ), TP_fast_assign( __entry->host_no = cmd->device->host->host_no; __entry->channel = cmd->device->channel; __entry->id = cmd->device->id; __entry->lun = cmd->device->lun; __entry->rtn = rtn; __entry->opcode = cmd->cmnd[0]; __entry->cmd_len = cmd->cmd_len; __entry->driver_tag = scsi_cmd_to_rq(cmd)->tag; __entry->scheduler_tag = scsi_cmd_to_rq(cmd)->internal_tag; __entry->data_sglen = scsi_sg_count(cmd); __entry->prot_sglen = scsi_prot_sg_count(cmd); __entry->prot_op = scsi_get_prot_op(cmd); memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); ), TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \ " prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s)" \ " rtn=%d", __entry->host_no, __entry->channel, __entry->id, __entry->lun, __entry->data_sglen, __entry->prot_sglen, show_prot_op_name(__entry->prot_op), __entry->driver_tag, __entry->scheduler_tag, show_opcode_name(__entry->opcode), __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len), __entry->rtn) ); DECLARE_EVENT_CLASS(scsi_cmd_done_timeout_template, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd), TP_STRUCT__entry( __field( unsigned int, host_no ) __field( unsigned int, channel ) __field( unsigned int, id ) __field( unsigned int, lun ) __field( int, result ) __field( unsigned int, opcode ) __field( unsigned int, cmd_len ) __field( int, driver_tag) __field( int, scheduler_tag) __field( unsigned int, data_sglen ) __field( unsigned int, prot_sglen ) __field( unsigned char, prot_op ) __dynamic_array(unsigned char, cmnd, cmd->cmd_len) __field( u8, sense_key ) __field( u8, asc ) __field( u8, ascq ) ), TP_fast_assign( struct scsi_sense_hdr sshdr; __entry->host_no = cmd->device->host->host_no; __entry->channel = cmd->device->channel; __entry->id = cmd->device->id; __entry->lun = cmd->device->lun; __entry->result = cmd->result; __entry->opcode = cmd->cmnd[0]; __entry->cmd_len = cmd->cmd_len; __entry->driver_tag = scsi_cmd_to_rq(cmd)->tag; __entry->scheduler_tag = scsi_cmd_to_rq(cmd)->internal_tag; __entry->data_sglen = scsi_sg_count(cmd); __entry->prot_sglen = scsi_prot_sg_count(cmd); __entry->prot_op = scsi_get_prot_op(cmd); memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); if (cmd->sense_buffer && SCSI_SENSE_VALID(cmd) && scsi_command_normalize_sense(cmd, &sshdr)) { __entry->sense_key = sshdr.sense_key; __entry->asc = sshdr.asc; __entry->ascq = sshdr.ascq; } else { __entry->sense_key = 0; __entry->asc = 0; __entry->ascq = 0; } ), TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u " \ "prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s) " \ "result=(driver=%s host=%s message=%s status=%s) " "sense=(key=%#x asc=%#x ascq=%#x)", __entry->host_no, __entry->channel, __entry->id, __entry->lun, __entry->data_sglen, __entry->prot_sglen, show_prot_op_name(__entry->prot_op), __entry->driver_tag, __entry->scheduler_tag, show_opcode_name(__entry->opcode), __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len), "DRIVER_OK", show_hostbyte_name(((__entry->result) >> 16) & 0xff), "COMMAND_COMPLETE", show_statusbyte_name(__entry->result & 0xff), __entry->sense_key, __entry->asc, __entry->ascq) ); DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_done, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd)); DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_timeout, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd)); TRACE_EVENT(scsi_eh_wakeup, TP_PROTO(struct Scsi_Host *shost), TP_ARGS(shost), TP_STRUCT__entry( __field( unsigned int, host_no ) ), TP_fast_assign( __entry->host_no = shost->host_no; ), TP_printk("host_no=%u", __entry->host_no) ); #endif /* _TRACE_SCSI_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
25 25 3 7 37 37 37 4 4 2 2 2 48 47 48 48 43 47 35 35 37 25 25 25 12 12 48 2 2 27 27 1 1 1 5 3 1 3 3 5 5 5 65 64 64 50 26 26 26 25 1 12 12 1 15 15 1468 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "soft-interface.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/cpumask.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "hard-interface.h" #include "multicast.h" #include "network-coding.h" #include "send.h" #include "translation-table.h" /** * batadv_skb_head_push() - Increase header size and move (push) head pointer * @skb: packet buffer which should be modified * @len: number of bytes to add * * Return: 0 on success or negative error number in case of failure */ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) { int result; /* TODO: We must check if we can release all references to non-payload * data using __skb_header_release in our skbs to allow skb_cow_header * to work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer * to write freely in that area. */ result = skb_cow_head(skb, len); if (result < 0) return result; skb_push(skb, len); return 0; } static int batadv_interface_open(struct net_device *dev) { netif_start_queue(dev); return 0; } static int batadv_interface_release(struct net_device *dev) { netif_stop_queue(dev); return 0; } /** * batadv_sum_counter() - Sum the cpu-local counters for index 'idx' * @bat_priv: the bat priv with all the soft interface information * @idx: index of counter to sum up * * Return: sum of all cpu-local counters */ static u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) { u64 *counters, sum = 0; int cpu; for_each_possible_cpu(cpu) { counters = per_cpu_ptr(bat_priv->bat_counters, cpu); sum += counters[idx]; } return sum; } static struct net_device_stats *batadv_interface_stats(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); struct net_device_stats *stats = &dev->stats; stats->tx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_TX); stats->tx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_TX_BYTES); stats->tx_dropped = batadv_sum_counter(bat_priv, BATADV_CNT_TX_DROPPED); stats->rx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_RX); stats->rx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_RX_BYTES); return stats; } static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; struct sockaddr *addr = p; u8 old_addr[ETH_ALEN]; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; ether_addr_copy(old_addr, dev->dev_addr); eth_hw_addr_set(dev, addr->sa_data); /* only modify transtable if it has been initialized before */ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) return 0; rcu_read_lock(); hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { batadv_tt_local_remove(bat_priv, old_addr, vlan->vid, "mac address changed", false); batadv_tt_local_add(dev, addr->sa_data, vlan->vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); } rcu_read_unlock(); return 0; } static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) { struct batadv_priv *bat_priv = netdev_priv(dev); /* check ranges */ if (new_mtu < ETH_MIN_MTU || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; dev->mtu = new_mtu; bat_priv->mtu_set_by_user = new_mtu; return 0; } /** * batadv_interface_set_rx_mode() - set the rx mode of a device * @dev: registered network device to modify * * We do not actually need to set any rx filters for the virtual batman * soft interface. However a dummy handler enables a user to set static * multicast listeners for instance. */ static void batadv_interface_set_rx_mode(struct net_device *dev) { } static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, struct net_device *soft_iface) { struct ethhdr *ethhdr; struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *primary_if = NULL; struct batadv_bcast_packet *bcast_packet; static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, 0x00, 0x00}; static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, 0x00, 0x00}; enum batadv_dhcp_recipient dhcp_rcp = BATADV_DHCP_NO; u8 *dst_hint = NULL, chaddr[ETH_ALEN]; struct vlan_ethhdr *vhdr; unsigned int header_len = 0; int data_len = skb->len, ret; unsigned long brd_delay = 0; bool do_bcast = false, client_added; unsigned short vid; u32 seqno; int gw_mode; enum batadv_forw_mode forw_mode = BATADV_FORW_BCAST; int mcast_is_routable = 0; int network_offset = ETH_HLEN; __be16 proto; if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto dropped; /* reset control block to avoid left overs from previous users */ memset(skb->cb, 0, sizeof(struct batadv_skb_cb)); netif_trans_update(soft_iface); vid = batadv_get_vid(skb, 0); skb_reset_mac_header(skb); ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; switch (ntohs(proto)) { case ETH_P_8021Q: if (!pskb_may_pull(skb, sizeof(*vhdr))) goto dropped; vhdr = vlan_eth_hdr(skb); proto = vhdr->h_vlan_encapsulated_proto; /* drop batman-in-batman packets to prevent loops */ if (proto != htons(ETH_P_BATMAN)) { network_offset += VLAN_HLEN; break; } fallthrough; case ETH_P_BATMAN: goto dropped; } skb_set_network_header(skb, network_offset); if (batadv_bla_tx(bat_priv, skb, vid)) goto dropped; /* skb->data might have been reallocated by batadv_bla_tx() */ ethhdr = eth_hdr(skb); /* Register the client MAC in the transtable */ if (!is_multicast_ether_addr(ethhdr->h_source) && !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) { client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source, vid, skb->skb_iif, skb->mark); if (!client_added) goto dropped; } /* Snoop address candidates from DHCPACKs for early DAT filling */ batadv_dat_snoop_outgoing_dhcp_ack(bat_priv, skb, proto, vid); /* don't accept stp packets. STP does not help in meshes. * better use the bridge loop avoidance ... * * The same goes for ECTP sent at least by some Cisco Switches, * it might confuse the mesh when used with bridge loop avoidance. */ if (batadv_compare_eth(ethhdr->h_dest, stp_addr)) goto dropped; if (batadv_compare_eth(ethhdr->h_dest, ectp_addr)) goto dropped; gw_mode = atomic_read(&bat_priv->gw.mode); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* if gw mode is off, broadcast every packet */ if (gw_mode == BATADV_GW_MODE_OFF) { do_bcast = true; goto send; } dhcp_rcp = batadv_gw_dhcp_recipient_get(skb, &header_len, chaddr); /* skb->data may have been modified by * batadv_gw_dhcp_recipient_get() */ ethhdr = eth_hdr(skb); /* if gw_mode is on, broadcast any non-DHCP message. * All the DHCP packets are going to be sent as unicast */ if (dhcp_rcp == BATADV_DHCP_NO) { do_bcast = true; goto send; } if (dhcp_rcp == BATADV_DHCP_TO_CLIENT) dst_hint = chaddr; else if ((gw_mode == BATADV_GW_MODE_SERVER) && (dhcp_rcp == BATADV_DHCP_TO_SERVER)) /* gateways should not forward any DHCP message if * directed to a DHCP server */ goto dropped; send: if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) { forw_mode = batadv_mcast_forw_mode(bat_priv, skb, vid, &mcast_is_routable); switch (forw_mode) { case BATADV_FORW_BCAST: break; case BATADV_FORW_UCASTS: case BATADV_FORW_MCAST: do_bcast = false; break; case BATADV_FORW_NONE: fallthrough; default: goto dropped; } } } batadv_skb_set_priority(skb, 0); /* ethernet packet should be broadcasted */ if (do_bcast) { primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto dropped; /* in case of ARP request, we do not immediately broadcasti the * packet, instead we first wait for DAT to try to retrieve the * correct ARP entry */ if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) brd_delay = msecs_to_jiffies(ARP_REQ_DELAY); if (batadv_skb_head_push(skb, sizeof(*bcast_packet)) < 0) goto dropped; bcast_packet = (struct batadv_bcast_packet *)skb->data; bcast_packet->version = BATADV_COMPAT_VERSION; bcast_packet->ttl = BATADV_TTL - 1; /* batman packet type: broadcast */ bcast_packet->packet_type = BATADV_BCAST; bcast_packet->reserved = 0; /* hw address of first interface is the orig mac because only * this mac is known throughout the mesh */ ether_addr_copy(bcast_packet->orig, primary_if->net_dev->dev_addr); /* set broadcast sequence number */ seqno = atomic_inc_return(&bat_priv->bcast_seqno); bcast_packet->seqno = htonl(seqno); batadv_send_bcast_packet(bat_priv, skb, brd_delay, true); /* unicast packet */ } else { /* DHCP packets going to a server will use the GW feature */ if (dhcp_rcp == BATADV_DHCP_TO_SERVER) { ret = batadv_gw_out_of_range(bat_priv, skb); if (ret) goto dropped; ret = batadv_send_skb_via_gw(bat_priv, skb, vid); } else if (forw_mode == BATADV_FORW_UCASTS) { ret = batadv_mcast_forw_send(bat_priv, skb, vid, mcast_is_routable); } else if (forw_mode == BATADV_FORW_MCAST) { ret = batadv_mcast_forw_mcsend(bat_priv, skb); } else { if (batadv_dat_snoop_outgoing_arp_request(bat_priv, skb)) goto dropped; batadv_dat_snoop_outgoing_arp_reply(bat_priv, skb); ret = batadv_send_skb_via_tt(bat_priv, skb, dst_hint, vid); } if (ret != NET_XMIT_SUCCESS) goto dropped_freed; } batadv_inc_counter(bat_priv, BATADV_CNT_TX); batadv_add_counter(bat_priv, BATADV_CNT_TX_BYTES, data_len); goto end; dropped: kfree_skb(skb); dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: batadv_hardif_put(primary_if); return NETDEV_TX_OK; } /** * batadv_interface_rx() - receive ethernet frame on local batman-adv interface * @soft_iface: local interface which will receive the ethernet frame * @skb: ethernet frame for @soft_iface * @hdr_size: size of already parsed batman-adv header * @orig_node: originator from which the batman-adv packet was sent * * Sends an ethernet frame to the receive path of the local @soft_iface. * skb->data has still point to the batman-adv header with the size @hdr_size. * The caller has to have parsed this header already and made sure that at least * @hdr_size bytes are still available for pull in @skb. * * The packet may still get dropped. This can happen when the encapsulated * ethernet frame is invalid or contains again an batman-adv packet. Also * unicast packets will be dropped directly when it was sent between two * isolated clients. */ void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node) { struct batadv_bcast_packet *batadv_bcast_packet; struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct vlan_ethhdr *vhdr; struct ethhdr *ethhdr; unsigned short vid; int packet_type; batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data; packet_type = batadv_bcast_packet->packet_type; skb_pull_rcsum(skb, hdr_size); skb_reset_mac_header(skb); /* clean the netfilter state now that the batman-adv header has been * removed */ nf_reset_ct(skb); if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) goto dropped; vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); switch (ntohs(ethhdr->h_proto)) { case ETH_P_8021Q: if (!pskb_may_pull(skb, VLAN_ETH_HLEN)) goto dropped; vhdr = skb_vlan_eth_hdr(skb); /* drop batman-in-batman packets to prevent loops */ if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) break; fallthrough; case ETH_P_BATMAN: goto dropped; } /* skb->dev & skb->pkt_type are set here */ skb->protocol = eth_type_trans(skb, soft_iface); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); /* Let the bridge loop avoidance check the packet. If will * not handle it, we can safely push it up. */ if (batadv_bla_rx(bat_priv, skb, vid, packet_type)) goto out; if (orig_node) batadv_tt_add_temporary_global_entry(bat_priv, orig_node, ethhdr->h_source, vid); if (is_multicast_ether_addr(ethhdr->h_dest)) { /* set the mark on broadcast packets if AP isolation is ON and * the packet is coming from an "isolated" client */ if (batadv_vlan_ap_isola_get(bat_priv, vid) && batadv_tt_global_is_isolated(bat_priv, ethhdr->h_source, vid)) { /* save bits in skb->mark not covered by the mask and * apply the mark on the rest */ skb->mark &= ~bat_priv->isolation_mark_mask; skb->mark |= bat_priv->isolation_mark; } } else if (batadv_is_ap_isolated(bat_priv, ethhdr->h_source, ethhdr->h_dest, vid)) { goto dropped; } netif_rx(skb); goto out; dropped: kfree_skb(skb); out: return; } /** * batadv_softif_vlan_release() - release vlan from lists and queue for free * after rcu grace period * @ref: kref pointer of the vlan object */ void batadv_softif_vlan_release(struct kref *ref) { struct batadv_softif_vlan *vlan; vlan = container_of(ref, struct batadv_softif_vlan, refcount); spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); hlist_del_rcu(&vlan->list); spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); kfree_rcu(vlan, rcu); } /** * batadv_softif_vlan_get() - get the vlan object for a specific vid * @bat_priv: the bat priv with all the soft interface information * @vid: the identifier of the vlan object to retrieve * * Return: the private data of the vlan matching the vid passed as argument or * NULL otherwise. The refcounter of the returned object is incremented by 1. */ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_softif_vlan *vlan_tmp, *vlan = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->softif_vlan_list, list) { if (vlan_tmp->vid != vid) continue; if (!kref_get_unless_zero(&vlan_tmp->refcount)) continue; vlan = vlan_tmp; break; } rcu_read_unlock(); return vlan; } /** * batadv_softif_create_vlan() - allocate the needed resources for a new vlan * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier * * Return: 0 on success, a negative error otherwise. */ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_softif_vlan *vlan; spin_lock_bh(&bat_priv->softif_vlan_list_lock); vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { batadv_softif_vlan_put(vlan); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -EEXIST; } vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); if (!vlan) { spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -ENOMEM; } vlan->bat_priv = bat_priv; vlan->vid = vid; kref_init(&vlan->refcount); atomic_set(&vlan->ap_isolation, 0); kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); /* add a new TT local entry. This one will be marked with the NOPURGE * flag */ batadv_tt_local_add(bat_priv->soft_iface, bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); /* don't return reference to new softif_vlan */ batadv_softif_vlan_put(vlan); return 0; } /** * batadv_softif_destroy_vlan() - remove and destroy a softif_vlan object * @bat_priv: the bat priv with all the soft interface information * @vlan: the object to remove */ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, struct batadv_softif_vlan *vlan) { /* explicitly remove the associated TT local entry because it is marked * with the NOPURGE flag */ batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr, vlan->vid, "vlan interface destroyed", false); batadv_softif_vlan_put(vlan); } /** * batadv_interface_add_vid() - ndo_add_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the vlan id * @vid: identifier of the new vlan * * Set up all the internal structures for handling the new vlan on top of the * mesh interface * * Return: 0 on success or a negative error code in case of failure. */ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; /* only 802.1Q vlans are supported. * batman-adv does not know how to handle other types */ if (proto != htons(ETH_P_8021Q)) return -EINVAL; vid |= BATADV_VLAN_HAS_TAG; /* if a new vlan is getting created and it already exists, it means that * it was not deleted yet. batadv_softif_vlan_get() increases the * refcount in order to revive the object. * * if it does not exist then create it. */ vlan = batadv_softif_vlan_get(bat_priv, vid); if (!vlan) return batadv_softif_create_vlan(bat_priv, vid); /* add a new TT local entry. This one will be marked with the NOPURGE * flag. This must be added again, even if the vlan object already * exists, because the entry was deleted by kill_vid() */ batadv_tt_local_add(bat_priv->soft_iface, bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); return 0; } /** * batadv_interface_kill_vid() - ndo_kill_vid API implementation * @dev: the netdev of the mesh interface * @proto: protocol of the vlan id * @vid: identifier of the deleted vlan * * Destroy all the internal structures used to handle the vlan identified by vid * on top of the mesh interface * * Return: 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q * or -ENOENT if the specified vlan id wasn't registered. */ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; /* only 802.1Q vlans are supported. batman-adv does not know how to * handle other types */ if (proto != htons(ETH_P_8021Q)) return -EINVAL; vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); if (!vlan) return -ENOENT; batadv_softif_destroy_vlan(bat_priv, vlan); /* finally free the vlan object */ batadv_softif_vlan_put(vlan); return 0; } /* batman-adv network devices have devices nesting below it and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key batadv_netdev_xmit_lock_key; static struct lock_class_key batadv_netdev_addr_lock_key; /** * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue * @dev: device which owns the tx queue * @txq: tx queue to modify * @_unused: always NULL */ static void batadv_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, void *_unused) { lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); } /** * batadv_set_lockdep_class() - Set txq and addr_list lockdep class * @dev: network device to modify */ static void batadv_set_lockdep_class(struct net_device *dev) { lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); } /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify * * Return: error code on failures */ static int batadv_softif_init_late(struct net_device *dev) { struct batadv_priv *bat_priv; u32 random_seqno; int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; batadv_set_lockdep_class(dev); bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; /* batadv_interface_stats() needs to be available as soon as * register_netdevice() has been called */ bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(u64)); if (!bat_priv->bat_counters) return -ENOMEM; atomic_set(&bat_priv->aggregated_ogms, 1); atomic_set(&bat_priv->bonding, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bridge_loop_avoidance, 1); #endif #ifdef CONFIG_BATMAN_ADV_DAT atomic_set(&bat_priv->distributed_arp_table, 1); #endif #ifdef CONFIG_BATMAN_ADV_MCAST atomic_set(&bat_priv->multicast_mode, 1); atomic_set(&bat_priv->multicast_fanout, 16); atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0); atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0); atomic_set(&bat_priv->mcast.num_no_mc_ptype_capa, 0); #endif atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF); atomic_set(&bat_priv->gw.bandwidth_down, 100); atomic_set(&bat_priv->gw.bandwidth_up, 20); atomic_set(&bat_priv->orig_interval, 1000); atomic_set(&bat_priv->hop_penalty, 30); #ifdef CONFIG_BATMAN_ADV_DEBUG atomic_set(&bat_priv->log_level, 0); #endif atomic_set(&bat_priv->fragmentation, 1); atomic_set(&bat_priv->packet_size_max, ETH_DATA_LEN); atomic_set(&bat_priv->bcast_queue_left, BATADV_BCAST_QUEUE_LEN); atomic_set(&bat_priv->batman_queue_left, BATADV_BATMAN_QUEUE_LEN); atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); atomic_set(&bat_priv->bcast_seqno, 1); atomic_set(&bat_priv->tt.vn, 0); atomic_set(&bat_priv->tt.local_changes, 0); atomic_set(&bat_priv->tt.ogm_append_cnt, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bla.num_requests, 0); #endif atomic_set(&bat_priv->tp_num, 0); bat_priv->tt.last_changeset = NULL; bat_priv->tt.last_changeset_len = 0; bat_priv->isolation_mark = 0; bat_priv->isolation_mark_mask = 0; /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&bat_priv->frag_seqno, random_seqno); bat_priv->primary_if = NULL; batadv_nc_init_bat_priv(bat_priv); if (!bat_priv->algo_ops) { ret = batadv_algo_select(bat_priv, batadv_routing_algo); if (ret < 0) goto free_bat_counters; } ret = batadv_mesh_init(dev); if (ret < 0) goto free_bat_counters; return 0; free_bat_counters: free_percpu(bat_priv->bat_counters); bat_priv->bat_counters = NULL; return ret; } /** * batadv_softif_slave_add() - Add a slave interface to a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should become the slave interface * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, struct net_device *slave_dev, struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface) goto out; ret = batadv_hardif_enable_interface(hard_iface, dev); out: batadv_hardif_put(hard_iface); return ret; } /** * batadv_softif_slave_del() - Delete a slave iface from a batadv_soft_interface * @dev: batadv_soft_interface used as master interface * @slave_dev: net_device which should be removed from the master interface * * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_del(struct net_device *dev, struct net_device *slave_dev) { struct batadv_hard_iface *hard_iface; int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface != dev) goto out; batadv_hardif_disable_interface(hard_iface); ret = 0; out: batadv_hardif_put(hard_iface); return ret; } static const struct net_device_ops batadv_netdev_ops = { .ndo_init = batadv_softif_init_late, .ndo_open = batadv_interface_open, .ndo_stop = batadv_interface_release, .ndo_get_stats = batadv_interface_stats, .ndo_vlan_rx_add_vid = batadv_interface_add_vid, .ndo_vlan_rx_kill_vid = batadv_interface_kill_vid, .ndo_set_mac_address = batadv_interface_set_mac_addr, .ndo_change_mtu = batadv_interface_change_mtu, .ndo_set_rx_mode = batadv_interface_set_rx_mode, .ndo_start_xmit = batadv_interface_tx, .ndo_validate_addr = eth_validate_addr, .ndo_add_slave = batadv_softif_slave_add, .ndo_del_slave = batadv_softif_slave_del, }; static void batadv_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver)); strscpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version)); strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); strscpy(info->bus_info, "batman", sizeof(info->bus_info)); } /* Inspired by drivers/net/ethernet/dlink/sundance.c:1702 * Declare each description string in struct.name[] to get fixed sized buffer * and compile time checking for strings longer than ETH_GSTRING_LEN. */ static const struct { const char name[ETH_GSTRING_LEN]; } batadv_counters_strings[] = { { "tx" }, { "tx_bytes" }, { "tx_dropped" }, { "rx" }, { "rx_bytes" }, { "forward" }, { "forward_bytes" }, { "mgmt_tx" }, { "mgmt_tx_bytes" }, { "mgmt_rx" }, { "mgmt_rx_bytes" }, { "frag_tx" }, { "frag_tx_bytes" }, { "frag_rx" }, { "frag_rx_bytes" }, { "frag_fwd" }, { "frag_fwd_bytes" }, { "tt_request_tx" }, { "tt_request_rx" }, { "tt_response_tx" }, { "tt_response_rx" }, { "tt_roam_adv_tx" }, { "tt_roam_adv_rx" }, #ifdef CONFIG_BATMAN_ADV_MCAST { "mcast_tx" }, { "mcast_tx_bytes" }, { "mcast_tx_local" }, { "mcast_tx_local_bytes" }, { "mcast_rx" }, { "mcast_rx_bytes" }, { "mcast_rx_local" }, { "mcast_rx_local_bytes" }, { "mcast_fwd" }, { "mcast_fwd_bytes" }, #endif #ifdef CONFIG_BATMAN_ADV_DAT { "dat_get_tx" }, { "dat_get_rx" }, { "dat_put_tx" }, { "dat_put_rx" }, { "dat_cached_reply_tx" }, #endif #ifdef CONFIG_BATMAN_ADV_NC { "nc_code" }, { "nc_code_bytes" }, { "nc_recode" }, { "nc_recode_bytes" }, { "nc_buffer" }, { "nc_decode" }, { "nc_decode_bytes" }, { "nc_decode_failed" }, { "nc_sniffed" }, #endif }; static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data) { if (stringset == ETH_SS_STATS) memcpy(data, batadv_counters_strings, sizeof(batadv_counters_strings)); } static void batadv_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct batadv_priv *bat_priv = netdev_priv(dev); int i; for (i = 0; i < BATADV_CNT_NUM; i++) data[i] = batadv_sum_counter(bat_priv, i); } static int batadv_get_sset_count(struct net_device *dev, int stringset) { if (stringset == ETH_SS_STATS) return BATADV_CNT_NUM; return -EOPNOTSUPP; } static const struct ethtool_ops batadv_ethtool_ops = { .get_drvinfo = batadv_get_drvinfo, .get_link = ethtool_op_get_link, .get_strings = batadv_get_strings, .get_ethtool_stats = batadv_get_ethtool_stats, .get_sset_count = batadv_get_sset_count, }; /** * batadv_softif_free() - Deconstructor of batadv_soft_interface * @dev: Device to cleanup and remove */ static void batadv_softif_free(struct net_device *dev) { batadv_mesh_free(dev); /* some scheduled RCU callbacks need the bat_priv struct to accomplish * their tasks. Wait for them all to be finished before freeing the * netdev and its private data (bat_priv) */ rcu_barrier(); } /** * batadv_softif_init_early() - early stage initialization of soft interface * @dev: registered network device to modify */ static void batadv_softif_init_early(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &batadv_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = batadv_softif_free; dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL; dev->features |= NETIF_F_LLTX; dev->priv_flags |= IFF_NO_QUEUE; /* can't call min_mtu, because the needed variables * have not been initialized yet */ dev->mtu = ETH_DATA_LEN; /* generate random address */ eth_hw_addr_random(dev); dev->ethtool_ops = &batadv_ethtool_ops; } /** * batadv_softif_validate() - validate configuration of new batadv link * @tb: IFLA_INFO_DATA netlink attributes * @data: enum batadv_ifla_attrs attributes * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct batadv_algo_ops *algo_ops; if (!data) return 0; if (data[IFLA_BATADV_ALGO_NAME]) { algo_ops = batadv_algo_get(nla_data(data[IFLA_BATADV_ALGO_NAME])); if (!algo_ops) return -EINVAL; } return 0; } /** * batadv_softif_newlink() - pre-initialize and register new batadv link * @src_net: the applicable net namespace * @dev: network device to register * @tb: IFLA_INFO_DATA netlink attributes * @data: enum batadv_ifla_attrs attributes * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct batadv_priv *bat_priv = netdev_priv(dev); const char *algo_name; int err; if (data && data[IFLA_BATADV_ALGO_NAME]) { algo_name = nla_data(data[IFLA_BATADV_ALGO_NAME]); err = batadv_algo_select(bat_priv, algo_name); if (err) return -EINVAL; } return register_netdevice(dev); } /** * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via * netlink * @soft_iface: the to-be-removed batman-adv interface * @head: list pointer */ static void batadv_softif_destroy_netlink(struct net_device *soft_iface, struct list_head *head) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *hard_iface; struct batadv_softif_vlan *vlan; list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface == soft_iface) batadv_hardif_disable_interface(hard_iface); } /* destroy the "untagged" VLAN */ vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (vlan) { batadv_softif_destroy_vlan(bat_priv, vlan); batadv_softif_vlan_put(vlan); } unregister_netdevice_queue(soft_iface, head); } /** * batadv_softif_is_valid() - Check whether device is a batadv soft interface * @net_dev: device which should be checked * * Return: true when net_dev is a batman-adv interface, false otherwise */ bool batadv_softif_is_valid(const struct net_device *net_dev) { if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx) return true; return false; } static const struct nla_policy batadv_ifla_policy[IFLA_BATADV_MAX + 1] = { [IFLA_BATADV_ALGO_NAME] = { .type = NLA_NUL_STRING }, }; struct rtnl_link_ops batadv_link_ops __read_mostly = { .kind = "batadv", .priv_size = sizeof(struct batadv_priv), .setup = batadv_softif_init_early, .maxtype = IFLA_BATADV_MAX, .policy = batadv_ifla_policy, .validate = batadv_softif_validate, .newlink = batadv_softif_newlink, .dellink = batadv_softif_destroy_netlink, };
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_PKT_CLS_H #define __NET_PKT_CLS_H #include <linux/pkt_cls.h> #include <linux/workqueue.h> #include <net/sch_generic.h> #include <net/act_api.h> #include <net/net_namespace.h> /* TC action not accessible from user space */ #define TC_ACT_CONSUMED (TC_ACT_VALUE_MAX + 1) /* Basic packet classifier frontend definitions. */ struct tcf_walker { int stop; int skip; int count; bool nonempty; unsigned long cookie; int (*fn)(struct tcf_proto *, void *node, struct tcf_walker *); }; int register_tcf_proto_ops(struct tcf_proto_ops *ops); void unregister_tcf_proto_ops(struct tcf_proto_ops *ops); struct tcf_block_ext_info { enum flow_block_binder_type binder_type; tcf_chain_head_change_t *chain_head_change; void *chain_head_change_priv; u32 block_index; }; struct tcf_qevent { struct tcf_block *block; struct tcf_block_ext_info info; struct tcf_proto __rcu *filter_chain; }; struct tcf_block_cb; bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index); void tcf_chain_put_by_act(struct tcf_chain *chain); struct tcf_chain *tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain); struct tcf_proto *tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack); int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack); void tcf_block_put(struct tcf_block *block); void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei); int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, int police, struct tcf_proto *tp, u32 handle, bool used_action_miss); static inline bool tcf_block_shared(struct tcf_block *block) { return block->index; } static inline bool tcf_block_non_null_shared(struct tcf_block *block) { return block && block->index; } static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { WARN_ON(tcf_block_shared(block)); return block->q; } int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); static inline bool tc_cls_stats_dump(struct tcf_proto *tp, struct tcf_walker *arg, void *filter) { if (arg->count >= arg->skip && arg->fn(tp, filter, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } #else static inline bool tcf_block_shared(struct tcf_block *block) { return false; } static inline bool tcf_block_non_null_shared(struct tcf_block *block) { return false; } static inline int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack) { return 0; } static inline int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_block_put(struct tcf_block *block) { } static inline void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { } static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; } static inline int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { return TC_ACT_UNSPEC; } #endif static inline unsigned long __cls_set_class(unsigned long *clp, unsigned long cl) { return xchg(clp, cl); } static inline void tcf_set_drop_reason(struct tcf_result *res, enum skb_drop_reason reason) { res->drop_reason = reason; } static inline void __tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base) { unsigned long cl; cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); cl = __cls_set_class(&r->class, cl); if (cl) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base) { struct Qdisc *q = tp->chain->block->q; /* Check q as it is not set for shared blocks. In that case, * setting class is not supported. */ if (!q) return; sch_tree_lock(q); __tcf_bind_filter(q, r, base); sch_tree_unlock(q); } static inline void __tcf_unbind_filter(struct Qdisc *q, struct tcf_result *r) { unsigned long cl; if ((cl = __cls_set_class(&r->class, 0)) != 0) q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) { struct Qdisc *q = tp->chain->block->q; if (!q) return; __tcf_unbind_filter(q, r); } static inline void tc_cls_bind_class(u32 classid, unsigned long cl, void *q, struct tcf_result *res, unsigned long base) { if (res->classid == classid) { if (cl) __tcf_bind_filter(q, res, base); else __tcf_unbind_filter(q, res); } } struct tcf_exts { #ifdef CONFIG_NET_CLS_ACT __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ int nr_actions; struct tc_action **actions; struct net *net; netns_tracker ns_tracker; struct tcf_exts_miss_cookie_node *miss_cookie_node; #endif /* Map to export classifier specific extension TLV types to the * generic extensions API. Unsupported extensions must be set to 0. */ int action; int police; }; static inline int tcf_exts_init(struct tcf_exts *exts, struct net *net, int action, int police) { #ifdef CONFIG_NET_CLS return tcf_exts_init_ex(exts, net, action, police, NULL, 0, false); #else return -EOPNOTSUPP; #endif } /* Return false if the netns is being destroyed in cleanup_net(). Callers * need to do cleanup synchronously in this case, otherwise may race with * tc_action_net_exit(). Return true for other cases. */ static inline bool tcf_exts_get_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT exts->net = maybe_get_net(exts->net); if (exts->net) netns_tracker_alloc(exts->net, &exts->ns_tracker, GFP_KERNEL); return exts->net != NULL; #else return true; #endif } static inline void tcf_exts_put_net(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT if (exts->net) put_net_track(exts->net, &exts->ns_tracker); #endif } #ifdef CONFIG_NET_CLS_ACT #define tcf_exts_for_each_action(i, a, exts) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = (exts)->actions[i]); i++) #else #define tcf_exts_for_each_action(i, a, exts) \ for (; 0; (void)(i), (void)(a), (void)(exts)) #endif #define tcf_act_for_each_action(i, a, actions) \ for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = actions[i]); i++) static inline bool tc_act_in_hw(struct tc_action *act) { return !!act->in_hw_count; } static inline void tcf_exts_hw_stats_update(const struct tcf_exts *exts, struct flow_stats *stats, bool use_act_stats) { #ifdef CONFIG_NET_CLS_ACT int i; for (i = 0; i < exts->nr_actions; i++) { struct tc_action *a = exts->actions[i]; if (use_act_stats || tc_act_in_hw(a)) { if (!tcf_action_update_hw_stats(a)) continue; } preempt_disable(); tcf_action_stats_update(a, stats->bytes, stats->pkts, stats->drops, stats->lastused, true); preempt_enable(); a->used_hw_stats = stats->used_hw_stats; a->used_hw_stats_valid = stats->used_hw_stats_valid; } #endif } /** * tcf_exts_has_actions - check if at least one action is present * @exts: tc filter extensions handle * * Returns true if at least one action is present. */ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT return exts->nr_actions; #else return false; #endif } /** * tcf_exts_exec - execute tc filter extensions * @skb: socket buffer * @exts: tc filter extensions handle * @res: desired result * * Executes all configured extensions. Returns TC_ACT_OK on a normal execution, * a negative number if the filter must be considered unmatched or * a positive action code (TC_ACT_*) which must be returned to the * underlying layer. */ static inline int tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions, exts->nr_actions, res); #endif return TC_ACT_OK; } static inline int tcf_exts_exec_ex(struct sk_buff *skb, struct tcf_exts *exts, int act_index, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT return tcf_action_exec(skb, exts->actions + act_index, exts->nr_actions - act_index, res); #else return TC_ACT_OK; #endif } int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, struct netlink_ext_ack *extack); int tcf_exts_validate_ex(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack); void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); /** * struct tcf_pkt_info - packet information * * @ptr: start of the pkt data * @nexthdr: offset of the next header */ struct tcf_pkt_info { unsigned char * ptr; int nexthdr; }; #ifdef CONFIG_NET_EMATCH struct tcf_ematch_ops; /** * struct tcf_ematch - extended match (ematch) * * @matchid: identifier to allow userspace to reidentify a match * @flags: flags specifying attributes and the relation to other matches * @ops: the operations lookup table of the corresponding ematch module * @datalen: length of the ematch specific configuration data * @data: ematch specific data * @net: the network namespace */ struct tcf_ematch { struct tcf_ematch_ops * ops; unsigned long data; unsigned int datalen; u16 matchid; u16 flags; struct net *net; }; static inline int tcf_em_is_container(struct tcf_ematch *em) { return !em->ops; } static inline int tcf_em_is_simple(struct tcf_ematch *em) { return em->flags & TCF_EM_SIMPLE; } static inline int tcf_em_is_inverted(struct tcf_ematch *em) { return em->flags & TCF_EM_INVERT; } static inline int tcf_em_last_match(struct tcf_ematch *em) { return (em->flags & TCF_EM_REL_MASK) == TCF_EM_REL_END; } static inline int tcf_em_early_end(struct tcf_ematch *em, int result) { if (tcf_em_last_match(em)) return 1; if (result == 0 && em->flags & TCF_EM_REL_AND) return 1; if (result != 0 && em->flags & TCF_EM_REL_OR) return 1; return 0; } /** * struct tcf_ematch_tree - ematch tree handle * * @hdr: ematch tree header supplied by userspace * @matches: array of ematches */ struct tcf_ematch_tree { struct tcf_ematch_tree_hdr hdr; struct tcf_ematch * matches; }; /** * struct tcf_ematch_ops - ematch module operations * * @kind: identifier (kind) of this ematch module * @datalen: length of expected configuration data (optional) * @change: called during validation (optional) * @match: called during ematch tree evaluation, must return 1/0 * @destroy: called during destroyage (optional) * @dump: called during dumping process (optional) * @owner: owner, must be set to THIS_MODULE * @link: link to previous/next ematch module (internal use) */ struct tcf_ematch_ops { int kind; int datalen; int (*change)(struct net *net, void *, int, struct tcf_ematch *); int (*match)(struct sk_buff *, struct tcf_ematch *, struct tcf_pkt_info *); void (*destroy)(struct tcf_ematch *); int (*dump)(struct sk_buff *, struct tcf_ematch *); struct module *owner; struct list_head link; }; int tcf_em_register(struct tcf_ematch_ops *); void tcf_em_unregister(struct tcf_ematch_ops *); int tcf_em_tree_validate(struct tcf_proto *, struct nlattr *, struct tcf_ematch_tree *); void tcf_em_tree_destroy(struct tcf_ematch_tree *); int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int); int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, struct tcf_pkt_info *); /** * tcf_em_tree_match - evaulate an ematch tree * * @skb: socket buffer of the packet in question * @tree: ematch tree to be used for evaluation * @info: packet information examined by classifier * * This function matches @skb against the ematch tree in @tree by going * through all ematches respecting their logic relations returning * as soon as the result is obvious. * * Returns 1 if the ematch tree as-one matches, no ematches are configured * or ematch is not enabled in the kernel, otherwise 0 is returned. */ static inline int tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, struct tcf_pkt_info *info) { if (tree->hdr.nmatches) return __tcf_em_tree_match(skb, tree, info); else return 1; } #define MODULE_ALIAS_TCF_EMATCH(kind) MODULE_ALIAS("ematch-kind-" __stringify(kind)) #else /* CONFIG_NET_EMATCH */ struct tcf_ematch_tree { }; #define tcf_em_tree_validate(tp, tb, t) ((void)(t), 0) #define tcf_em_tree_destroy(t) do { (void)(t); } while(0) #define tcf_em_tree_dump(skb, t, tlv) (0) #define tcf_em_tree_match(skb, t, info) ((void)(info), 1) #endif /* CONFIG_NET_EMATCH */ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) { switch (layer) { case TCF_LAYER_LINK: return skb_mac_header(skb); case TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: return skb_transport_header(skb); } return NULL; } static inline int tcf_valid_offset(const struct sk_buff *skb, const unsigned char *ptr, const int len) { return likely((ptr + len) <= skb_tail_pointer(skb) && ptr >= skb->head && (ptr <= (ptr + len))); } static inline int tcf_change_indev(struct net *net, struct nlattr *indev_tlv, struct netlink_ext_ack *extack) { char indev[IFNAMSIZ]; struct net_device *dev; if (nla_strscpy(indev, indev_tlv, IFNAMSIZ) < 0) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Interface name too long"); return -EINVAL; } dev = __dev_get_by_name(net, indev); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Network device not found"); return -ENODEV; } return dev->ifindex; } static inline bool tcf_match_indev(struct sk_buff *skb, int ifindex) { if (!ifindex) return true; if (!skb->skb_iif) return false; return ifindex == skb->skb_iif; } int tc_setup_offload_action(struct flow_action *flow_action, const struct tcf_exts *exts, struct netlink_ext_ack *extack); void tc_cleanup_offload_action(struct flow_action *flow_action); int tc_setup_action(struct flow_action *flow_action, struct tc_action *actions[], u32 miss_cookie_base, struct netlink_ext_ack *extack); int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop, bool rtnl_held); int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *old_flags, unsigned int *old_in_hw_count, u32 *new_flags, unsigned int *new_in_hw_count, bool rtnl_held); int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held); int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, enum tc_setup_type type, void *type_data, void *cb_priv, u32 *flags, unsigned int *in_hw_count); unsigned int tcf_exts_num_actions(struct tcf_exts *exts); #ifdef CONFIG_NET_CLS_ACT int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch); int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack); struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret); int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe); #else static inline int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch) { } static inline int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { return 0; } static inline struct sk_buff * tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret) { return skb; } static inline int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe) { return 0; } #endif struct tc_cls_u32_knode { struct tcf_exts *exts; struct tcf_result *res; struct tc_u32_sel *sel; u32 handle; u32 val; u32 mask; u32 link_handle; u8 fshift; }; struct tc_cls_u32_hnode { u32 handle; u32 prio; unsigned int divisor; }; enum tc_clsu32_command { TC_CLSU32_NEW_KNODE, TC_CLSU32_REPLACE_KNODE, TC_CLSU32_DELETE_KNODE, TC_CLSU32_NEW_HNODE, TC_CLSU32_REPLACE_HNODE, TC_CLSU32_DELETE_HNODE, }; struct tc_cls_u32_offload { struct flow_cls_common_offload common; /* knode values */ enum tc_clsu32_command command; union { struct tc_cls_u32_knode knode; struct tc_cls_u32_hnode hnode; }; }; static inline bool tc_can_offload(const struct net_device *dev) { return dev->features & NETIF_F_HW_TC; } static inline bool tc_can_offload_extack(const struct net_device *dev, struct netlink_ext_ack *extack) { bool can = tc_can_offload(dev); if (!can) NL_SET_ERR_MSG(extack, "TC offload is disabled on net device"); return can; } static inline bool tc_cls_can_offload_and_chain0(const struct net_device *dev, struct flow_cls_common_offload *common) { if (!tc_can_offload_extack(dev, common->extack)) return false; if (common->chain_index) { NL_SET_ERR_MSG(common->extack, "Driver supports only offload of chain 0"); return false; } return true; } static inline bool tc_skip_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false; } static inline bool tc_skip_sw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false; } /* SKIP_HW and SKIP_SW are mutually exclusive flags. */ static inline bool tc_flags_valid(u32 flags) { if (flags & ~(TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW | TCA_CLS_FLAGS_VERBOSE)) return false; flags &= TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW; if (!(flags ^ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW))) return false; return true; } static inline bool tc_in_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_IN_HW) ? true : false; } static inline void tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, const struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio >> 16; if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static inline struct tc_skb_ext *tc_skb_ext_alloc(struct sk_buff *skb) { struct tc_skb_ext *tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); if (tc_skb_ext) memset(tc_skb_ext, 0, sizeof(*tc_skb_ext)); return tc_skb_ext; } #endif enum tc_matchall_command { TC_CLSMATCHALL_REPLACE, TC_CLSMATCHALL_DESTROY, TC_CLSMATCHALL_STATS, }; struct tc_cls_matchall_offload { struct flow_cls_common_offload common; enum tc_matchall_command command; struct flow_rule *rule; struct flow_stats stats; bool use_act_stats; unsigned long cookie; }; enum tc_clsbpf_command { TC_CLSBPF_OFFLOAD, TC_CLSBPF_STATS, }; struct tc_cls_bpf_offload { struct flow_cls_common_offload common; enum tc_clsbpf_command command; struct tcf_exts *exts; struct bpf_prog *prog; struct bpf_prog *oldprog; const char *name; bool exts_integrated; }; /* This structure holds cookie structure that is passed from user * to the kernel for actions and classifiers */ struct tc_cookie { u8 *data; u32 len; struct rcu_head rcu; }; struct tc_qopt_offload_stats { struct gnet_stats_basic_sync *bstats; struct gnet_stats_queue *qstats; }; enum tc_mq_command { TC_MQ_CREATE, TC_MQ_DESTROY, TC_MQ_STATS, TC_MQ_GRAFT, }; struct tc_mq_opt_offload_graft_params { unsigned long queue; u32 child_handle; }; struct tc_mq_qopt_offload { enum tc_mq_command command; u32 handle; union { struct tc_qopt_offload_stats stats; struct tc_mq_opt_offload_graft_params graft_params; }; }; enum tc_htb_command { /* Root */ TC_HTB_CREATE, /* Initialize HTB offload. */ TC_HTB_DESTROY, /* Destroy HTB offload. */ /* Classes */ /* Allocate qid and create leaf. */ TC_HTB_LEAF_ALLOC_QUEUE, /* Convert leaf to inner, preserve and return qid, create new leaf. */ TC_HTB_LEAF_TO_INNER, /* Delete leaf, while siblings remain. */ TC_HTB_LEAF_DEL, /* Delete leaf, convert parent to leaf, preserving qid. */ TC_HTB_LEAF_DEL_LAST, /* TC_HTB_LEAF_DEL_LAST, but delete driver data on hardware errors. */ TC_HTB_LEAF_DEL_LAST_FORCE, /* Modify parameters of a node. */ TC_HTB_NODE_MODIFY, /* Class qdisc */ TC_HTB_LEAF_QUERY_QUEUE, /* Query qid by classid. */ }; struct tc_htb_qopt_offload { struct netlink_ext_ack *extack; enum tc_htb_command command; u32 parent_classid; u16 classid; u16 qid; u32 quantum; u64 rate; u64 ceil; u8 prio; }; #define TC_HTB_CLASSID_ROOT U32_MAX enum tc_red_command { TC_RED_REPLACE, TC_RED_DESTROY, TC_RED_STATS, TC_RED_XSTATS, TC_RED_GRAFT, }; struct tc_red_qopt_offload_params { u32 min; u32 max; u32 probability; u32 limit; bool is_ecn; bool is_harddrop; bool is_nodrop; struct gnet_stats_queue *qstats; }; struct tc_red_qopt_offload { enum tc_red_command command; u32 handle; u32 parent; union { struct tc_red_qopt_offload_params set; struct tc_qopt_offload_stats stats; struct red_stats *xstats; u32 child_handle; }; }; enum tc_gred_command { TC_GRED_REPLACE, TC_GRED_DESTROY, TC_GRED_STATS, }; struct tc_gred_vq_qopt_offload_params { bool present; u32 limit; u32 prio; u32 min; u32 max; bool is_ecn; bool is_harddrop; u32 probability; /* Only need backlog, see struct tc_prio_qopt_offload_params */ u32 *backlog; }; struct tc_gred_qopt_offload_params { bool grio_on; bool wred_on; unsigned int dp_cnt; unsigned int dp_def; struct gnet_stats_queue *qstats; struct tc_gred_vq_qopt_offload_params tab[MAX_DPs]; }; struct tc_gred_qopt_offload_stats { struct gnet_stats_basic_sync bstats[MAX_DPs]; struct gnet_stats_queue qstats[MAX_DPs]; struct red_stats *xstats[MAX_DPs]; }; struct tc_gred_qopt_offload { enum tc_gred_command command; u32 handle; u32 parent; union { struct tc_gred_qopt_offload_params set; struct tc_gred_qopt_offload_stats stats; }; }; enum tc_prio_command { TC_PRIO_REPLACE, TC_PRIO_DESTROY, TC_PRIO_STATS, TC_PRIO_GRAFT, }; struct tc_prio_qopt_offload_params { int bands; u8 priomap[TC_PRIO_MAX + 1]; /* At the point of un-offloading the Qdisc, the reported backlog and * qlen need to be reduced by the portion that is in HW. */ struct gnet_stats_queue *qstats; }; struct tc_prio_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_prio_qopt_offload { enum tc_prio_command command; u32 handle; u32 parent; union { struct tc_prio_qopt_offload_params replace_params; struct tc_qopt_offload_stats stats; struct tc_prio_qopt_offload_graft_params graft_params; }; }; enum tc_root_command { TC_ROOT_GRAFT, }; struct tc_root_qopt_offload { enum tc_root_command command; u32 handle; bool ingress; }; enum tc_ets_command { TC_ETS_REPLACE, TC_ETS_DESTROY, TC_ETS_STATS, TC_ETS_GRAFT, }; struct tc_ets_qopt_offload_replace_params { unsigned int bands; u8 priomap[TC_PRIO_MAX + 1]; unsigned int quanta[TCQ_ETS_MAX_BANDS]; /* 0 for strict bands. */ unsigned int weights[TCQ_ETS_MAX_BANDS]; struct gnet_stats_queue *qstats; }; struct tc_ets_qopt_offload_graft_params { u8 band; u32 child_handle; }; struct tc_ets_qopt_offload { enum tc_ets_command command; u32 handle; u32 parent; union { struct tc_ets_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; struct tc_ets_qopt_offload_graft_params graft_params; }; }; enum tc_tbf_command { TC_TBF_REPLACE, TC_TBF_DESTROY, TC_TBF_STATS, TC_TBF_GRAFT, }; struct tc_tbf_qopt_offload_replace_params { struct psched_ratecfg rate; u32 max_size; struct gnet_stats_queue *qstats; }; struct tc_tbf_qopt_offload { enum tc_tbf_command command; u32 handle; u32 parent; union { struct tc_tbf_qopt_offload_replace_params replace_params; struct tc_qopt_offload_stats stats; u32 child_handle; }; }; enum tc_fifo_command { TC_FIFO_REPLACE, TC_FIFO_DESTROY, TC_FIFO_STATS, }; struct tc_fifo_qopt_offload { enum tc_fifo_command command; u32 handle; u32 parent; union { struct tc_qopt_offload_stats stats; }; }; #ifdef CONFIG_NET_CLS_ACT DECLARE_STATIC_KEY_FALSE(tc_skb_ext_tc); void tc_skb_ext_tc_enable(void); void tc_skb_ext_tc_disable(void); #define tc_skb_ext_tc_enabled() static_branch_unlikely(&tc_skb_ext_tc) #else /* CONFIG_NET_CLS_ACT */ static inline void tc_skb_ext_tc_enable(void) { } static inline void tc_skb_ext_tc_disable(void) { } #define tc_skb_ext_tc_enabled() false #endif #endif
82 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 /* SPDX-License-Identifier: GPL-2.0 */ /* * sysfs.h - definitions for the device driver filesystem * * Copyright (c) 2001,2002 Patrick Mochel * Copyright (c) 2004 Silicon Graphics, Inc. * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #ifndef _SYSFS_H_ #define _SYSFS_H_ #include <linux/kernfs.h> #include <linux/compiler.h> #include <linux/errno.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/kobject_ns.h> #include <linux/stat.h> #include <linux/atomic.h> struct kobject; struct module; struct bin_attribute; enum kobj_ns_type; struct attribute { const char *name; umode_t mode; #ifdef CONFIG_DEBUG_LOCK_ALLOC bool ignore_lockdep:1; struct lock_class_key *key; struct lock_class_key skey; #endif }; /** * sysfs_attr_init - initialize a dynamically allocated sysfs attribute * @attr: struct attribute to initialize * * Initialize a dynamically allocated struct attribute so we can * make lockdep happy. This is a new requirement for attributes * and initially this is only needed when lockdep is enabled. * Lockdep gives a nice error when your attribute is added to * sysfs if you don't have this. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_attr_init(attr) \ do { \ static struct lock_class_key __key; \ \ (attr)->key = &__key; \ } while (0) #else #define sysfs_attr_init(attr) do {} while (0) #endif /** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name * If specified, the attribute group will be created in * a new subdirectory with this name. * @is_visible: Optional: Function to return permissions associated with an * attribute of the group. Will be called repeatedly for each * non-binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if an attribute is not visible. The returned value * will replace static permissions defined in struct attribute. * @is_bin_visible: * Optional: Function to return permissions associated with a * binary attribute of the group. Will be called repeatedly * for each binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if a binary attribute is not visible. The returned * value will replace static permissions defined in * struct bin_attribute. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. */ struct attribute_group { const char *name; umode_t (*is_visible)(struct kobject *, struct attribute *, int); umode_t (*is_bin_visible)(struct kobject *, struct bin_attribute *, int); struct attribute **attrs; struct bin_attribute **bin_attrs; }; /* * Use these macros to make defining attributes easier. * See include/linux/device.h for examples.. */ #define SYSFS_PREALLOC 010000 #define __ATTR(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _show, \ .store = _store, \ } #define __ATTR_PREALLOC(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\ .show = _show, \ .store = _store, \ } #define __ATTR_RO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .show = _name##_show, \ } #define __ATTR_RO_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ } #define __ATTR_RW_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ .store = _name##_store, \ } #define __ATTR_WO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ .store = _name##_store, \ } #define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store) #define __ATTR_NULL { .attr = { .name = NULL } } #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), .mode = _mode, \ .ignore_lockdep = true }, \ .show = _show, \ .store = _store, \ } #else #define __ATTR_IGNORE_LOCKDEP __ATTR #endif #define __ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group *_name##_groups[] = { \ &_name##_group, \ NULL, \ } #define ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) #define BIN_ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .bin_attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) struct file; struct vm_area_struct; struct address_space; struct bin_attribute { struct attribute attr; size_t size; void *private; struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); loff_t (*llseek)(struct file *, struct kobject *, struct bin_attribute *, loff_t, int); int (*mmap)(struct file *, struct kobject *, struct bin_attribute *attr, struct vm_area_struct *vma); }; /** * sysfs_bin_attr_init - initialize a dynamically allocated bin_attribute * @attr: struct bin_attribute to initialize * * Initialize a dynamically allocated struct bin_attribute so we * can make lockdep happy. This is a new requirement for * attributes and initially this is only needed when lockdep is * enabled. Lockdep gives a nice error when your attribute is * added to sysfs if you don't have this. */ #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr) /* macros to create static binary attributes easier */ #define __BIN_ATTR(_name, _mode, _read, _write, _size) { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .read = _read, \ .write = _write, \ .size = _size, \ } #define __BIN_ATTR_RO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .read = _name##_read, \ .size = _size, \ } #define __BIN_ATTR_WO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ .write = _name##_write, \ .size = _size, \ } #define __BIN_ATTR_RW(_name, _size) \ __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size) #define __BIN_ATTR_NULL __ATTR_NULL #define BIN_ATTR(_name, _mode, _read, _write, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read, \ _write, _size) #define BIN_ATTR_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size) #define BIN_ATTR_WO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size) #define BIN_ATTR_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size) #define __BIN_ATTR_ADMIN_RO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0400 }, \ .read = _name##_read, \ .size = _size, \ } #define __BIN_ATTR_ADMIN_RW(_name, _size) \ __BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size) #define BIN_ATTR_ADMIN_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size) #define BIN_ATTR_ADMIN_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size) struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); }; #ifdef CONFIG_SYSFS int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns); int __must_check sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns); int __must_check sysfs_create_mount_point(struct kobject *parent_kobj, const char *name); void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name); int __must_check sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); int __must_check sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode); struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr); void sysfs_unbreak_active_protection(struct kernfs_node *kn); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr); int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name); int __must_check sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name); void sysfs_remove_link(struct kobject *kobj, const char *name); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name, const void *new_ns); void sysfs_delete_link(struct kobject *dir, struct kobject *targ, const char *name); int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups); int __must_check sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name); int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name); void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr); int __must_check sysfs_init(void); static inline void sysfs_enable_ns(struct kernfs_node *kn) { return kernfs_enable_ns(kn); } int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid); int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid); __printf(2, 3) int sysfs_emit(char *buf, const char *fmt, ...); __printf(3, 4) int sysfs_emit_at(char *buf, int at, const char *fmt, ...); #else /* CONFIG_SYSFS */ static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; } static inline void sysfs_remove_dir(struct kobject *kobj) { } static inline int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { return 0; } static inline int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { return 0; } static inline int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { return 0; } static inline void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { } static inline int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { return 0; } static inline int sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr) { return 0; } static inline int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { return 0; } static inline struct kernfs_node * sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { return NULL; } static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn) { } static inline void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { } static inline bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { return false; } static inline void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr) { } static inline int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { return 0; } static inline void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { } static inline int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline void sysfs_remove_link(struct kobject *kobj, const char *name) { } static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t, const char *old_name, const char *new_name, const void *ns) { return 0; } static inline void sysfs_delete_link(struct kobject *k, struct kobject *t, const char *name) { } static inline int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { } static inline int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { return 0; } static inline void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { } static inline int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { return 0; } static inline void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { } static inline int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { return 0; } static inline void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { } static inline int __must_check sysfs_init(void) { return 0; } static inline void sysfs_enable_ns(struct kernfs_node *kn) { } static inline int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid) { return 0; } __printf(2, 3) static inline int sysfs_emit(char *buf, const char *fmt, ...) { return 0; } __printf(3, 4) static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, const struct attribute *attr) { return sysfs_create_file_ns(kobj, attr, NULL); } static inline void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) { sysfs_remove_file_ns(kobj, attr, NULL); } static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name) { return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL); } static inline void sysfs_notify_dirent(struct kernfs_node *kn) { kernfs_notify(kn); } static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent, const char *name) { return kernfs_find_and_get(parent, name); } static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn) { kernfs_get(kn); return kn; } static inline void sysfs_put(struct kernfs_node *kn) { kernfs_put(kn); } #endif /* _SYSFS_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 /* SPDX-License-Identifier: GPL-2.0 */ /* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. */ #ifndef _LINUX_XSK_QUEUE_H #define _LINUX_XSK_QUEUE_H #include <linux/types.h> #include <linux/if_xdp.h> #include <net/xdp_sock.h> #include <net/xsk_buff_pool.h> #include "xsk.h" struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; /* Hinder the adjacent cache prefetcher to prefetch the consumer * pointer if the producer pointer is touched and vice versa. */ u32 pad1 ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; u32 pad2 ____cacheline_aligned_in_smp; u32 flags; u32 pad3 ____cacheline_aligned_in_smp; }; /* Used for the RX and TX queues for packets */ struct xdp_rxtx_ring { struct xdp_ring ptrs; struct xdp_desc desc[] ____cacheline_aligned_in_smp; }; /* Used for the fill and completion queues for buffers */ struct xdp_umem_ring { struct xdp_ring ptrs; u64 desc[] ____cacheline_aligned_in_smp; }; struct xsk_queue { u32 ring_mask; u32 nentries; u32 cached_prod; u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; u64 queue_empty_descs; size_t ring_vmalloc_size; }; struct parsed_desc { u32 mb; u32 valid; }; /* The structure of the shared state of the rings are a simple * circular buffer, as outlined in * Documentation/core-api/circular-buffers.rst. For the Rx and * completion ring, the kernel is the producer and user space is the * consumer. For the Tx and fill rings, the kernel is the consumer and * user space is the producer. * * producer consumer * * if (LOAD ->consumer) { (A) LOAD.acq ->producer (C) * STORE $data LOAD $data * STORE.rel ->producer (B) STORE.rel ->consumer (D) * } * * (A) pairs with (D), and (B) pairs with (C). * * Starting with (B), it protects the data from being written after * the producer pointer. If this barrier was missing, the consumer * could observe the producer pointer being set and thus load the data * before the producer has written the new data. The consumer would in * this case load the old data. * * (C) protects the consumer from speculatively loading the data before * the producer pointer actually has been read. If we do not have this * barrier, some architectures could load old data as speculative loads * are not discarded as the CPU does not know there is a dependency * between ->producer and data. * * (A) is a control dependency that separates the load of ->consumer * from the stores of $data. In case ->consumer indicates there is no * room in the buffer to store $data we do not. The dependency will * order both of the stores after the loads. So no barrier is needed. * * (D) protects the load of the data to be observed to happen after the * store of the consumer pointer. If we did not have this memory * barrier, the producer could observe the consumer pointer being set * and overwrite the data with a new value before the consumer got the * chance to read the old value. The consumer would thus miss reading * the old entry and very likely read the new entry twice, once right * now and again after circling through the ring. */ /* The operations on the rings are the following: * * producer consumer * * RESERVE entries PEEK in the ring for entries * WRITE data into the ring READ data from the ring * SUBMIT entries RELEASE entries * * The producer reserves one or more entries in the ring. It can then * fill in these entries and finally submit them so that they can be * seen and read by the consumer. * * The consumer peeks into the ring to see if the producer has written * any new entries. If so, the consumer can then read these entries * and when it is done reading them release them back to the producer * so that the producer can use these slots to fill in new entries. * * The function names below reflect these operations. */ /* Functions that read and validate content from consumer rings. */ static inline void __xskq_cons_read_addr_unchecked(struct xsk_queue *q, u32 cached_cons, u64 *addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; u32 idx = cached_cons & q->ring_mask; *addr = ring->desc[idx]; } static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_cons != q->cached_prod) { __xskq_cons_read_addr_unchecked(q, q->cached_cons, addr); return true; } return false; } static inline bool xp_unused_options_set(u32 options) { return options & ~(XDP_PKT_CONTD | XDP_TX_METADATA); } static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { u64 addr = desc->addr - pool->tx_metadata_len; u64 len = desc->len + pool->tx_metadata_len; u64 offset = addr & (pool->chunk_size - 1); if (!desc->len) return false; if (offset + len > pool->chunk_size) return false; if (addr >= pool->addrs_cnt) return false; if (xp_unused_options_set(desc->options)) return false; return true; } static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len; u64 len = desc->len + pool->tx_metadata_len; if (!desc->len) return false; if (len > pool->chunk_size) return false; if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt || xp_desc_crosses_non_contig_pg(pool, addr, len)) return false; if (xp_unused_options_set(desc->options)) return false; return true; } static inline bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : xp_aligned_validate_desc(pool, desc); } static inline bool xskq_has_descs(struct xsk_queue *q) { return q->cached_cons != q->cached_prod; } static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xsk_buff_pool *pool) { if (!xp_validate_desc(pool, d)) { q->invalid_descs++; return false; } return true; } static inline bool xskq_cons_read_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) { if (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; *desc = ring->desc[idx]; return xskq_cons_is_valid_desc(q, desc, pool); } q->queue_empty_descs++; return false; } static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) { q->cached_cons += cnt; } static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool, struct xdp_desc *desc, struct parsed_desc *parsed) { parsed->valid = xskq_cons_is_valid_desc(q, desc, pool); parsed->mb = xp_mb_desc(desc); } static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, u32 max) { u32 cached_cons = q->cached_cons, nb_entries = 0; struct xdp_desc *descs = pool->tx_descs; u32 total_descs = 0, nr_frags = 0; /* track first entry, if stumble upon *any* invalid descriptor, rewind * current packet that consists of frags and stop the processing */ while (cached_cons != q->cached_prod && nb_entries < max) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = cached_cons & q->ring_mask; struct parsed_desc parsed; descs[nb_entries] = ring->desc[idx]; cached_cons++; parse_desc(q, pool, &descs[nb_entries], &parsed); if (unlikely(!parsed.valid)) break; if (likely(!parsed.mb)) { total_descs += (nr_frags + 1); nr_frags = 0; } else { nr_frags++; if (nr_frags == pool->netdev->xdp_zc_max_segs) { nr_frags = 0; break; } } nb_entries++; } cached_cons -= nr_frags; /* Release valid plus any invalid entries */ xskq_cons_release_n(q, cached_cons - q->cached_cons); return total_descs; } /* Functions for consumers */ static inline void __xskq_cons_release(struct xsk_queue *q) { smp_store_release(&q->ring->consumer, q->cached_cons); /* D, matchees A */ } static inline void __xskq_cons_peek(struct xsk_queue *q) { /* Refresh the local pointer */ q->cached_prod = smp_load_acquire(&q->ring->producer); /* C, matches B */ } static inline void xskq_cons_get_entries(struct xsk_queue *q) { __xskq_cons_release(q); __xskq_cons_peek(q); } static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max) { u32 entries = q->cached_prod - q->cached_cons; if (entries >= max) return max; __xskq_cons_peek(q); entries = q->cached_prod - q->cached_cons; return entries >= max ? max : entries; } static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) { return xskq_cons_nb_entries(q, cnt) >= cnt; } static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); return xskq_cons_read_addr_unchecked(q, addr); } static inline bool xskq_cons_peek_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); return xskq_cons_read_desc(q, desc, pool); } /* To improve performance in the xskq_cons_release functions, only update local state here. * Reflect this to global state when we get new entries from the ring in * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop. */ static inline void xskq_cons_release(struct xsk_queue *q) { q->cached_cons++; } static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt) { q->cached_cons -= cnt; } static inline u32 xskq_cons_present_entries(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer); } /* Functions for producers */ static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); if (free_entries >= max) return max; /* Refresh the local tail pointer */ q->cached_cons = READ_ONCE(q->ring->consumer); free_entries = q->nentries - (q->cached_prod - q->cached_cons); return free_entries >= max ? max : free_entries; } static inline bool xskq_prod_is_full(struct xsk_queue *q) { return xskq_prod_nb_free(q, 1) ? false : true; } static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt) { q->cached_prod -= cnt; } static inline int xskq_prod_reserve(struct xsk_queue *q) { if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ q->cached_prod++; return 0; } static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ ring->desc[q->cached_prod++ & q->ring_mask] = addr; return 0; } static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, u32 nb_entries) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; u32 i, cached_prod; /* A, matches D */ cached_prod = q->cached_prod; for (i = 0; i < nb_entries; i++) ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr; q->cached_prod = cached_prod; } static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len, u32 flags) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx; if (xskq_prod_is_full(q)) return -ENOBUFS; /* A, matches D */ idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; ring->desc[idx].options = flags; return 0; } static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { smp_store_release(&q->ring->producer, idx); /* B, matches C */ } static inline void xskq_prod_submit(struct xsk_queue *q) { __xskq_prod_submit(q, q->cached_prod); } static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) { __xskq_prod_submit(q, q->ring->producer + nb_entries); } static inline bool xskq_prod_is_empty(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); } /* For both producers and consumers */ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) { return q ? q->invalid_descs : 0; } static inline u64 xskq_nb_queue_empty_descs(struct xsk_queue *q) { return q ? q->queue_empty_descs : 0; } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); #endif /* _LINUX_XSK_QUEUE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 /* SPDX-License-Identifier: GPL-2.0 */ /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Manage send buffer * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #ifndef SMC_TX_H #define SMC_TX_H #include <linux/socket.h> #include <linux/types.h> #include "smc.h" #include "smc_cdc.h" static inline int smc_tx_prepared_sends(struct smc_connection *conn) { union smc_host_cursor sent, prep; smc_curs_copy(&sent, &conn->tx_curs_sent, conn); smc_curs_copy(&prep, &conn->tx_curs_prep, conn); return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } void smc_tx_pending(struct smc_connection *conn); void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void smc_tx_consumer_update(struct smc_connection *conn, bool force); int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, u32 offset, int signal); #endif /* SMC_TX_H */
1288 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 /* * linux/include/linux/console.h * * Copyright (C) 1993 Hamish Macdonald * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. * * Changed: * 10-Mar-94: Arno Griffioen: Conversion for vt100 emulator port from PC LINUX */ #ifndef _LINUX_CONSOLE_H_ #define _LINUX_CONSOLE_H_ 1 #include <linux/atomic.h> #include <linux/bits.h> #include <linux/rculist.h> #include <linux/types.h> struct vc_data; struct console_font_op; struct console_font; struct module; struct tty_struct; struct notifier_block; enum con_scroll { SM_UP, SM_DOWN, }; enum vc_intensity; /** * struct consw - callbacks for consoles * * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. * @con_set_palette: sets the palette of the console to @table (optional) * @con_scrolldelta: the contents of the console should be scrolled by @lines. * Invoked by user. (optional) */ struct consw { struct module *owner; const char *(*con_startup)(void); void (*con_init)(struct vc_data *vc, int init); void (*con_deinit)(struct vc_data *vc); void (*con_clear)(struct vc_data *vc, int sy, int sx, int height, int width); void (*con_putc)(struct vc_data *vc, int c, int ypos, int xpos); void (*con_putcs)(struct vc_data *vc, const unsigned short *s, int count, int ypos, int xpos); void (*con_cursor)(struct vc_data *vc, int mode); bool (*con_scroll)(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int lines); int (*con_switch)(struct vc_data *vc); int (*con_blank)(struct vc_data *vc, int blank, int mode_switch); int (*con_font_set)(struct vc_data *vc, struct console_font *font, unsigned int vpitch, unsigned int flags); int (*con_font_get)(struct vc_data *vc, struct console_font *font, unsigned int vpitch); int (*con_font_default)(struct vc_data *vc, struct console_font *font, char *name); int (*con_resize)(struct vc_data *vc, unsigned int width, unsigned int height, unsigned int user); void (*con_set_palette)(struct vc_data *vc, const unsigned char *table); void (*con_scrolldelta)(struct vc_data *vc, int lines); int (*con_set_origin)(struct vc_data *vc); void (*con_save_screen)(struct vc_data *vc); u8 (*con_build_attr)(struct vc_data *vc, u8 color, enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic); void (*con_invert_region)(struct vc_data *vc, u16 *p, int count); u16 *(*con_screen_pos)(const struct vc_data *vc, int offset); unsigned long (*con_getxy)(struct vc_data *vc, unsigned long position, int *px, int *py); /* * Flush the video console driver's scrollback buffer */ void (*con_flush_scrollback)(struct vc_data *vc); /* * Prepare the console for the debugger. This includes, but is not * limited to, unblanking the console, loading an appropriate * palette, and allowing debugger generated output. */ int (*con_debug_enter)(struct vc_data *vc); /* * Restore the console to its pre-debug state as closely as possible. */ int (*con_debug_leave)(struct vc_data *vc); }; extern const struct consw *conswitchp; extern const struct consw dummy_con; /* dummy console buffer */ extern const struct consw vga_con; /* VGA text console */ extern const struct consw newport_con; /* SGI Newport console */ struct screen_info; #ifdef CONFIG_VGA_CONSOLE void vgacon_register_screen(struct screen_info *si); #else static inline void vgacon_register_screen(struct screen_info *si) { } #endif int con_is_bound(const struct consw *csw); int do_unregister_con_driver(const struct consw *csw); int do_take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); #ifdef CONFIG_HW_CONSOLE int con_debug_enter(struct vc_data *vc); int con_debug_leave(void); #else static inline int con_debug_enter(struct vc_data *vc) { return 0; } static inline int con_debug_leave(void) { return 0; } #endif /* cursor */ #define CM_DRAW (1) #define CM_ERASE (2) #define CM_MOVE (3) /* * The interface for a console, or any other device that wants to capture * console messages (printer driver?) */ /** * cons_flags - General console flags * @CON_PRINTBUFFER: Used by newly registered consoles to avoid duplicate * output of messages that were already shown by boot * consoles or read by userspace via syslog() syscall. * @CON_CONSDEV: Indicates that the console driver is backing * /dev/console. * @CON_ENABLED: Indicates if a console is allowed to print records. If * false, the console also will not advance to later * records. * @CON_BOOT: Marks the console driver as early console driver which * is used during boot before the real driver becomes * available. It will be automatically unregistered * when the real console driver is registered unless * "keep_bootcon" parameter is used. * @CON_ANYTIME: A misnomed historical flag which tells the core code * that the legacy @console::write callback can be invoked * on a CPU which is marked OFFLINE. That is misleading as * it suggests that there is no contextual limit for * invoking the callback. The original motivation was * readiness of the per-CPU areas. * @CON_BRL: Indicates a braille device which is exempt from * receiving the printk spam for obvious reasons. * @CON_EXTENDED: The console supports the extended output format of * /dev/kmesg which requires a larger output buffer. * @CON_SUSPENDED: Indicates if a console is suspended. If true, the * printing callbacks must not be called. * @CON_NBCON: Console can operate outside of the legacy style console_lock * constraints. */ enum cons_flags { CON_PRINTBUFFER = BIT(0), CON_CONSDEV = BIT(1), CON_ENABLED = BIT(2), CON_BOOT = BIT(3), CON_ANYTIME = BIT(4), CON_BRL = BIT(5), CON_EXTENDED = BIT(6), CON_SUSPENDED = BIT(7), CON_NBCON = BIT(8), }; /** * struct nbcon_state - console state for nbcon consoles * @atom: Compound of the state fields for atomic operations * * @req_prio: The priority of a handover request * @prio: The priority of the current owner * @unsafe: Console is busy in a non takeover region * @unsafe_takeover: A hostile takeover in an unsafe state happened in the * past. The console cannot be safe until re-initialized. * @cpu: The CPU on which the owner runs * * To be used for reading and preparing of the value stored in the nbcon * state variable @console::nbcon_state. * * The @prio and @req_prio fields are particularly important to allow * spin-waiting to timeout and give up without the risk of a waiter being * assigned the lock after giving up. */ struct nbcon_state { union { unsigned int atom; struct { unsigned int prio : 2; unsigned int req_prio : 2; unsigned int unsafe : 1; unsigned int unsafe_takeover : 1; unsigned int cpu : 24; }; }; }; /* * The nbcon_state struct is used to easily create and interpret values that * are stored in the @console::nbcon_state variable. Ensure this struct stays * within the size boundaries of the atomic variable's underlying type in * order to avoid any accidental truncation. */ static_assert(sizeof(struct nbcon_state) <= sizeof(int)); /** * nbcon_prio - console owner priority for nbcon consoles * @NBCON_PRIO_NONE: Unused * @NBCON_PRIO_NORMAL: Normal (non-emergency) usage * @NBCON_PRIO_EMERGENCY: Emergency output (WARN/OOPS...) * @NBCON_PRIO_PANIC: Panic output * @NBCON_PRIO_MAX: The number of priority levels * * A higher priority context can takeover the console when it is * in the safe state. The final attempt to flush consoles in panic() * can be allowed to do so even in an unsafe state (Hope and pray). */ enum nbcon_prio { NBCON_PRIO_NONE = 0, NBCON_PRIO_NORMAL, NBCON_PRIO_EMERGENCY, NBCON_PRIO_PANIC, NBCON_PRIO_MAX, }; struct console; struct printk_buffers; /** * struct nbcon_context - Context for console acquire/release * @console: The associated console * @spinwait_max_us: Limit for spin-wait acquire * @prio: Priority of the context * @allow_unsafe_takeover: Allow performing takeover even if unsafe. Can * be used only with NBCON_PRIO_PANIC @prio. It * might cause a system freeze when the console * is used later. * @backlog: Ringbuffer has pending records * @pbufs: Pointer to the text buffer for this context * @seq: The sequence number to print for this context */ struct nbcon_context { /* members set by caller */ struct console *console; unsigned int spinwait_max_us; enum nbcon_prio prio; unsigned int allow_unsafe_takeover : 1; /* members set by emit */ unsigned int backlog : 1; /* members set by acquire */ struct printk_buffers *pbufs; u64 seq; }; /** * struct nbcon_write_context - Context handed to the nbcon write callbacks * @ctxt: The core console context * @outbuf: Pointer to the text buffer for output * @len: Length to write * @unsafe_takeover: If a hostile takeover in an unsafe state has occurred */ struct nbcon_write_context { struct nbcon_context __private ctxt; char *outbuf; unsigned int len; bool unsafe_takeover; }; /** * struct console - The console descriptor structure * @name: The name of the console driver * @write: Write callback to output messages (Optional) * @read: Read callback for console input (Optional) * @device: The underlying TTY device driver (Optional) * @unblank: Callback to unblank the console (Optional) * @setup: Callback for initializing the console (Optional) * @exit: Callback for teardown of the console (Optional) * @match: Callback for matching a console (Optional) * @flags: Console flags. See enum cons_flags * @index: Console index, e.g. port number * @cflag: TTY control mode flags * @ispeed: TTY input speed * @ospeed: TTY output speed * @seq: Sequence number of the next ringbuffer record to print * @dropped: Number of unreported dropped ringbuffer records * @data: Driver private data * @node: hlist node for the console list * * @write_atomic: Write callback for atomic context * @nbcon_state: State for nbcon consoles * @nbcon_seq: Sequence number of the next record for nbcon to print * @pbufs: Pointer to nbcon private buffer */ struct console { char name[16]; void (*write)(struct console *co, const char *s, unsigned int count); int (*read)(struct console *co, char *s, unsigned int count); struct tty_driver *(*device)(struct console *co, int *index); void (*unblank)(void); int (*setup)(struct console *co, char *options); int (*exit)(struct console *co); int (*match)(struct console *co, char *name, int idx, char *options); short flags; short index; int cflag; uint ispeed; uint ospeed; u64 seq; unsigned long dropped; void *data; struct hlist_node node; /* nbcon console specific members */ bool (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; struct printk_buffers *pbufs; }; #ifdef CONFIG_LOCKDEP extern void lockdep_assert_console_list_lock_held(void); #else static inline void lockdep_assert_console_list_lock_held(void) { } #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC extern bool console_srcu_read_lock_is_held(void); #else static inline bool console_srcu_read_lock_is_held(void) { return 1; } #endif extern int console_srcu_read_lock(void); extern void console_srcu_read_unlock(int cookie); extern void console_list_lock(void) __acquires(console_mutex); extern void console_list_unlock(void) __releases(console_mutex); extern struct hlist_head console_list; /** * console_srcu_read_flags - Locklessly read the console flags * @con: struct console pointer of console to read flags from * * This function provides the necessary READ_ONCE() and data_race() * notation for locklessly reading the console flags. The READ_ONCE() * in this function matches the WRITE_ONCE() when @flags are modified * for registered consoles with console_srcu_write_flags(). * * Only use this function to read console flags when locklessly * iterating the console list via srcu. * * Context: Any context. */ static inline short console_srcu_read_flags(const struct console *con) { WARN_ON_ONCE(!console_srcu_read_lock_is_held()); /* * Locklessly reading console->flags provides a consistent * read value because there is at most one CPU modifying * console->flags and that CPU is using only read-modify-write * operations to do so. */ return data_race(READ_ONCE(con->flags)); } /** * console_srcu_write_flags - Write flags for a registered console * @con: struct console pointer of console to write flags to * @flags: new flags value to write * * Only use this function to write flags for registered consoles. It * requires holding the console_list_lock. * * Context: Any context. */ static inline void console_srcu_write_flags(struct console *con, short flags) { lockdep_assert_console_list_lock_held(); /* This matches the READ_ONCE() in console_srcu_read_flags(). */ WRITE_ONCE(con->flags, flags); } /* Variant of console_is_registered() when the console_list_lock is held. */ static inline bool console_is_registered_locked(const struct console *con) { lockdep_assert_console_list_lock_held(); return !hlist_unhashed(&con->node); } /* * console_is_registered - Check if the console is registered * @con: struct console pointer of console to check * * Context: Process context. May sleep while acquiring console list lock. * Return: true if the console is in the console list, otherwise false. * * If false is returned for a console that was previously registered, it * can be assumed that the console's unregistration is fully completed, * including the exit() callback after console list removal. */ static inline bool console_is_registered(const struct console *con) { bool ret; console_list_lock(); ret = console_is_registered_locked(con); console_list_unlock(); return ret; } /** * for_each_console_srcu() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * Although SRCU guarantees the console list will be consistent, the * struct console fields may be updated by other CPUs while iterating. * * Requires console_srcu_read_lock to be held. Can be invoked from * any context. */ #define for_each_console_srcu(con) \ hlist_for_each_entry_srcu(con, &console_list, node, \ console_srcu_read_lock_is_held()) /** * for_each_console() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * The console list and the console->flags are immutable while iterating. * * Requires console_list_lock to be held. */ #define for_each_console(con) \ lockdep_assert_console_list_lock_held(); \ hlist_for_each_entry(con, &console_list, node) #ifdef CONFIG_PRINTK extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); #else static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } #endif extern int console_set_on_cmdline; extern struct console *early_console; enum con_flush_mode { CONSOLE_FLUSH_PENDING, CONSOLE_REPLAY_ALL, }; extern int add_preferred_console(const char *name, const short idx, char *options); extern void console_force_preferred_locked(struct console *con); extern void register_console(struct console *); extern int unregister_console(struct console *); extern void console_lock(void); extern int console_trylock(void); extern void console_unlock(void); extern void console_conditional_schedule(void); extern void console_unblank(void); extern void console_flush_on_panic(enum con_flush_mode mode); extern struct tty_driver *console_device(int *); extern void console_stop(struct console *); extern void console_start(struct console *); extern int is_console_locked(void); extern int braille_register_console(struct console *, int index, char *console_options, char *braille_options); extern int braille_unregister_console(struct console *); #ifdef CONFIG_TTY extern void console_sysfs_notify(void); #else static inline void console_sysfs_notify(void) { } #endif extern bool console_suspend_enabled; /* Suspend and resume console messages over PM events */ extern void suspend_console(void); extern void resume_console(void); int mda_console_init(void); void vcs_make_sysfs(int index); void vcs_remove_sysfs(int index); /* Some debug stub to catch some of the obvious races in the VT code */ #define WARN_CONSOLE_UNLOCKED() \ WARN_ON(!atomic_read(&ignore_console_lock_warning) && \ !is_console_locked() && !oops_in_progress) /* * Increment ignore_console_lock_warning if you need to quiet * WARN_CONSOLE_UNLOCKED() for debugging purposes. */ extern atomic_t ignore_console_lock_warning; /* VESA Blanking Levels */ #define VESA_NO_BLANKING 0 #define VESA_VSYNC_SUSPEND 1 #define VESA_HSYNC_SUSPEND 2 #define VESA_POWERDOWN 3 extern void console_init(void); /* For deferred console takeover */ void dummycon_register_output_notifier(struct notifier_block *nb); void dummycon_unregister_output_notifier(struct notifier_block *nb); #endif /* _LINUX_CONSOLE_H */
1468 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 // SPDX-License-Identifier: GPL-2.0-only /* net/atm/clip.c - RFC1577 Classical IP over ATM */ /* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> /* for UINT_MAX */ #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/wait.h> #include <linux/timer.h> #include <linux/if_arp.h> /* for some manifest constants */ #include <linux/notifier.h> #include <linux/atm.h> #include <linux/atmdev.h> #include <linux/atmclip.h> #include <linux/atmarp.h> #include <linux/capability.h> #include <linux/ip.h> /* for net/route.h */ #include <linux/in.h> /* for struct sockaddr_in */ #include <linux/if.h> /* for IFF_UP */ #include <linux/inetdevice.h> #include <linux/bitops.h> #include <linux/poison.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <linux/jhash.h> #include <linux/slab.h> #include <net/route.h> /* for struct rtable and routing */ #include <net/icmp.h> /* icmp_send */ #include <net/arp.h> #include <linux/param.h> /* for HZ */ #include <linux/uaccess.h> #include <asm/byteorder.h> /* for htons etc. */ #include <linux/atomic.h> #include "common.h" #include "resources.h" #include <net/atmclip.h> static struct net_device *clip_devs; static struct atm_vcc *atmarpd; static struct timer_list idle_timer; static const struct neigh_ops clip_neigh_ops; static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip) { struct sock *sk; struct atmarp_ctrl *ctrl; struct sk_buff *skb; pr_debug("(%d)\n", type); if (!atmarpd) return -EUNATCH; skb = alloc_skb(sizeof(struct atmarp_ctrl), GFP_ATOMIC); if (!skb) return -ENOMEM; ctrl = skb_put(skb, sizeof(struct atmarp_ctrl)); ctrl->type = type; ctrl->itf_num = itf; ctrl->ip = ip; atm_force_charge(atmarpd, skb->truesize); sk = sk_atm(atmarpd); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return 0; } static void link_vcc(struct clip_vcc *clip_vcc, struct atmarp_entry *entry) { pr_debug("%p to entry %p (neigh %p)\n", clip_vcc, entry, entry->neigh); clip_vcc->entry = entry; clip_vcc->xoff = 0; /* @@@ may overrun buffer by one packet */ clip_vcc->next = entry->vccs; entry->vccs = clip_vcc; entry->neigh->used = jiffies; } static void unlink_clip_vcc(struct clip_vcc *clip_vcc) { struct atmarp_entry *entry = clip_vcc->entry; struct clip_vcc **walk; if (!entry) { pr_err("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); return; } netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */ entry->neigh->used = jiffies; for (walk = &entry->vccs; *walk; walk = &(*walk)->next) if (*walk == clip_vcc) { int error; *walk = clip_vcc->next; /* atomic */ clip_vcc->entry = NULL; if (clip_vcc->xoff) netif_wake_queue(entry->neigh->dev); if (entry->vccs) goto out; entry->expires = jiffies - 1; /* force resolution or expiration */ error = neigh_update(entry->neigh, NULL, NUD_NONE, NEIGH_UPDATE_F_ADMIN, 0); if (error) pr_err("neigh_update failed with %d\n", error); goto out; } pr_err("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc); out: netif_tx_unlock_bh(entry->neigh->dev); } /* The neighbour entry n->lock is held. */ static int neigh_check_cb(struct neighbour *n) { struct atmarp_entry *entry = neighbour_priv(n); struct clip_vcc *cv; if (n->ops != &clip_neigh_ops) return 0; for (cv = entry->vccs; cv; cv = cv->next) { unsigned long exp = cv->last_use + cv->idle_timeout; if (cv->idle_timeout && time_after(jiffies, exp)) { pr_debug("releasing vcc %p->%p of entry %p\n", cv, cv->vcc, entry); vcc_release_async(cv->vcc, -ETIMEDOUT); } } if (entry->vccs || time_before(jiffies, entry->expires)) return 0; if (refcount_read(&n->refcnt) > 1) { struct sk_buff *skb; pr_debug("destruction postponed with ref %d\n", refcount_read(&n->refcnt)); while ((skb = skb_dequeue(&n->arp_queue)) != NULL) dev_kfree_skb(skb); return 0; } pr_debug("expired neigh %p\n", n); return 1; } static void idle_timer_check(struct timer_list *unused) { write_lock(&arp_tbl.lock); __neigh_for_each_release(&arp_tbl, neigh_check_cb); mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); write_unlock(&arp_tbl.lock); } static int clip_arp_rcv(struct sk_buff *skb) { struct atm_vcc *vcc; pr_debug("\n"); vcc = ATM_SKB(skb)->vcc; if (!vcc || !atm_charge(vcc, skb->truesize)) { dev_kfree_skb_any(skb); return 0; } pr_debug("pushing to %p\n", vcc); pr_debug("using %p\n", CLIP_VCC(vcc)->old_push); CLIP_VCC(vcc)->old_push(vcc, skb); return 0; } static const unsigned char llc_oui[] = { 0xaa, /* DSAP: non-ISO */ 0xaa, /* SSAP: non-ISO */ 0x03, /* Ctrl: Unnumbered Information Command PDU */ 0x00, /* OUI: EtherType */ 0x00, 0x00 }; static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) { struct clip_vcc *clip_vcc = CLIP_VCC(vcc); pr_debug("\n"); if (!clip_devs) { atm_return(vcc, skb->truesize); kfree_skb(skb); return; } if (!skb) { pr_debug("removing VCC %p\n", clip_vcc); if (clip_vcc->entry) unlink_clip_vcc(clip_vcc); clip_vcc->old_push(vcc, NULL); /* pass on the bad news */ kfree(clip_vcc); return; } atm_return(vcc, skb->truesize); skb->dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : clip_devs; /* clip_vcc->entry == NULL if we don't have an IP address yet */ if (!skb->dev) { dev_kfree_skb_any(skb); return; } ATM_SKB(skb)->vcc = vcc; skb_reset_mac_header(skb); if (!clip_vcc->encap || skb->len < RFC1483LLC_LEN || memcmp(skb->data, llc_oui, sizeof(llc_oui))) skb->protocol = htons(ETH_P_IP); else { skb->protocol = ((__be16 *)skb->data)[3]; skb_pull(skb, RFC1483LLC_LEN); if (skb->protocol == htons(ETH_P_ARP)) { skb->dev->stats.rx_packets++; skb->dev->stats.rx_bytes += skb->len; clip_arp_rcv(skb); return; } } clip_vcc->last_use = jiffies; skb->dev->stats.rx_packets++; skb->dev->stats.rx_bytes += skb->len; memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); netif_rx(skb); } /* * Note: these spinlocks _must_not_ block on non-SMP. The only goal is that * clip_pop is atomic with respect to the critical section in clip_start_xmit. */ static void clip_pop(struct atm_vcc *vcc, struct sk_buff *skb) { struct clip_vcc *clip_vcc = CLIP_VCC(vcc); struct net_device *dev = skb->dev; int old; unsigned long flags; pr_debug("(vcc %p)\n", vcc); clip_vcc->old_pop(vcc, skb); /* skb->dev == NULL in outbound ARP packets */ if (!dev) return; spin_lock_irqsave(&PRIV(dev)->xoff_lock, flags); if (atm_may_send(vcc, 0)) { old = xchg(&clip_vcc->xoff, 0); if (old) netif_wake_queue(dev); } spin_unlock_irqrestore(&PRIV(dev)->xoff_lock, flags); } static void clip_neigh_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 *ip = (__be32 *) neigh->primary_key; pr_debug("(neigh %p, skb %p)\n", neigh, skb); to_atmarpd(act_need, PRIV(neigh->dev)->number, *ip); } static void clip_neigh_error(struct neighbour *neigh, struct sk_buff *skb) { #ifndef CONFIG_ATM_CLIP_NO_ICMP icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); #endif kfree_skb(skb); } static const struct neigh_ops clip_neigh_ops = { .family = AF_INET, .solicit = clip_neigh_solicit, .error_report = clip_neigh_error, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; static int clip_constructor(struct net_device *dev, struct neighbour *neigh) { struct atmarp_entry *entry = neighbour_priv(neigh); if (neigh->tbl->family != AF_INET) return -EINVAL; if (neigh->type != RTN_UNICAST) return -EINVAL; neigh->nud_state = NUD_NONE; neigh->ops = &clip_neigh_ops; neigh->output = neigh->ops->output; entry->neigh = neigh; entry->vccs = NULL; entry->expires = jiffies - 1; return 0; } /* @@@ copy bh locking from arp.c -- need to bh-enable atm code before */ /* * We play with the resolve flag: 0 and 1 have the usual meaning, but -1 means * to allocate the neighbour entry but not to ask atmarpd for resolution. Also, * don't increment the usage count. This is used to create entries in * clip_setentry. */ static int clip_encap(struct atm_vcc *vcc, int mode) { if (!CLIP_VCC(vcc)) return -EBADFD; CLIP_VCC(vcc)->encap = mode; return 0; } static netdev_tx_t clip_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct clip_priv *clip_priv = PRIV(dev); struct dst_entry *dst = skb_dst(skb); struct atmarp_entry *entry; struct neighbour *n; struct atm_vcc *vcc; struct rtable *rt; __be32 *daddr; int old; unsigned long flags; pr_debug("(skb %p)\n", skb); if (!dst) { pr_err("skb_dst(skb) == NULL\n"); dev_kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } rt = (struct rtable *) dst; if (rt->rt_gw_family == AF_INET) daddr = &rt->rt_gw4; else daddr = &ip_hdr(skb)->daddr; n = dst_neigh_lookup(dst, daddr); if (!n) { pr_err("NO NEIGHBOUR !\n"); dev_kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } entry = neighbour_priv(n); if (!entry->vccs) { if (time_after(jiffies, entry->expires)) { /* should be resolved */ entry->expires = jiffies + ATMARP_RETRY_DELAY * HZ; to_atmarpd(act_need, PRIV(dev)->number, *((__be32 *)n->primary_key)); } if (entry->neigh->arp_queue.qlen < ATMARP_MAX_UNRES_PACKETS) skb_queue_tail(&entry->neigh->arp_queue, skb); else { dev_kfree_skb(skb); dev->stats.tx_dropped++; } goto out_release_neigh; } pr_debug("neigh %p, vccs %p\n", entry, entry->vccs); ATM_SKB(skb)->vcc = vcc = entry->vccs->vcc; pr_debug("using neighbour %p, vcc %p\n", n, vcc); if (entry->vccs->encap) { void *here; here = skb_push(skb, RFC1483LLC_LEN); memcpy(here, llc_oui, sizeof(llc_oui)); ((__be16 *) here)[3] = skb->protocol; } atm_account_tx(vcc, skb); entry->vccs->last_use = jiffies; pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, vcc, vcc->dev); old = xchg(&entry->vccs->xoff, 1); /* assume XOFF ... */ if (old) { pr_warn("XOFF->XOFF transition\n"); goto out_release_neigh; } dev->stats.tx_packets++; dev->stats.tx_bytes += skb->len; vcc->send(vcc, skb); if (atm_may_send(vcc, 0)) { entry->vccs->xoff = 0; goto out_release_neigh; } spin_lock_irqsave(&clip_priv->xoff_lock, flags); netif_stop_queue(dev); /* XOFF -> throttle immediately */ barrier(); if (!entry->vccs->xoff) netif_start_queue(dev); /* Oh, we just raced with clip_pop. netif_start_queue should be good enough, because nothing should really be asleep because of the brief netif_stop_queue. If this isn't true or if it changes, use netif_wake_queue instead. */ spin_unlock_irqrestore(&clip_priv->xoff_lock, flags); out_release_neigh: neigh_release(n); return NETDEV_TX_OK; } static int clip_mkip(struct atm_vcc *vcc, int timeout) { struct clip_vcc *clip_vcc; if (!vcc->push) return -EBADFD; clip_vcc = kmalloc(sizeof(struct clip_vcc), GFP_KERNEL); if (!clip_vcc) return -ENOMEM; pr_debug("%p vcc %p\n", clip_vcc, vcc); clip_vcc->vcc = vcc; vcc->user_back = clip_vcc; set_bit(ATM_VF_IS_CLIP, &vcc->flags); clip_vcc->entry = NULL; clip_vcc->xoff = 0; clip_vcc->encap = 1; clip_vcc->last_use = jiffies; clip_vcc->idle_timeout = timeout * HZ; clip_vcc->old_push = vcc->push; clip_vcc->old_pop = vcc->pop; vcc->push = clip_push; vcc->pop = clip_pop; /* re-process everything received between connection setup and MKIP */ vcc_process_recv_queue(vcc); return 0; } static int clip_setentry(struct atm_vcc *vcc, __be32 ip) { struct neighbour *neigh; struct atmarp_entry *entry; int error; struct clip_vcc *clip_vcc; struct rtable *rt; if (vcc->push != clip_push) { pr_warn("non-CLIP VCC\n"); return -EBADF; } clip_vcc = CLIP_VCC(vcc); if (!ip) { if (!clip_vcc->entry) { pr_err("hiding hidden ATMARP entry\n"); return 0; } pr_debug("remove\n"); unlink_clip_vcc(clip_vcc); return 0; } rt = ip_route_output(&init_net, ip, 0, 1, 0); if (IS_ERR(rt)) return PTR_ERR(rt); neigh = __neigh_lookup(&arp_tbl, &ip, rt->dst.dev, 1); ip_rt_put(rt); if (!neigh) return -ENOMEM; entry = neighbour_priv(neigh); if (entry != clip_vcc->entry) { if (!clip_vcc->entry) pr_debug("add\n"); else { pr_debug("update\n"); unlink_clip_vcc(clip_vcc); } link_vcc(clip_vcc, entry); } error = neigh_update(neigh, llc_oui, NUD_PERMANENT, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); return error; } static const struct net_device_ops clip_netdev_ops = { .ndo_start_xmit = clip_start_xmit, .ndo_neigh_construct = clip_constructor, }; static void clip_setup(struct net_device *dev) { dev->netdev_ops = &clip_netdev_ops; dev->type = ARPHRD_ATM; dev->neigh_priv_len = sizeof(struct atmarp_entry); dev->hard_header_len = RFC1483LLC_LEN; dev->mtu = RFC1626_MTU; dev->tx_queue_len = 100; /* "normal" queue (packets) */ /* When using a "real" qdisc, the qdisc determines the queue */ /* length. tx_queue_len is only used for the default case, */ /* without any more elaborate queuing. 100 is a reasonable */ /* compromise between decent burst-tolerance and protection */ /* against memory hogs. */ netif_keep_dst(dev); } static int clip_create(int number) { struct net_device *dev; struct clip_priv *clip_priv; int error; if (number != -1) { for (dev = clip_devs; dev; dev = PRIV(dev)->next) if (PRIV(dev)->number == number) return -EEXIST; } else { number = 0; for (dev = clip_devs; dev; dev = PRIV(dev)->next) if (PRIV(dev)->number >= number) number = PRIV(dev)->number + 1; } dev = alloc_netdev(sizeof(struct clip_priv), "", NET_NAME_UNKNOWN, clip_setup); if (!dev) return -ENOMEM; clip_priv = PRIV(dev); sprintf(dev->name, "atm%d", number); spin_lock_init(&clip_priv->xoff_lock); clip_priv->number = number; error = register_netdev(dev); if (error) { free_netdev(dev); return error; } clip_priv->next = clip_devs; clip_devs = dev; pr_debug("registered (net:%s)\n", dev->name); return number; } static int clip_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) return NOTIFY_DONE; /* ignore non-CLIP devices */ if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops) return NOTIFY_DONE; switch (event) { case NETDEV_UP: pr_debug("NETDEV_UP\n"); to_atmarpd(act_up, PRIV(dev)->number, 0); break; case NETDEV_GOING_DOWN: pr_debug("NETDEV_DOWN\n"); to_atmarpd(act_down, PRIV(dev)->number, 0); break; case NETDEV_CHANGE: case NETDEV_CHANGEMTU: pr_debug("NETDEV_CHANGE*\n"); to_atmarpd(act_change, PRIV(dev)->number, 0); break; } return NOTIFY_DONE; } static int clip_inet_event(struct notifier_block *this, unsigned long event, void *ifa) { struct in_device *in_dev; struct netdev_notifier_info info; in_dev = ((struct in_ifaddr *)ifa)->ifa_dev; /* * Transitions are of the down-change-up type, so it's sufficient to * handle the change on up. */ if (event != NETDEV_UP) return NOTIFY_DONE; netdev_notifier_info_init(&info, in_dev->dev); return clip_device_event(this, NETDEV_CHANGE, &info); } static struct notifier_block clip_dev_notifier = { .notifier_call = clip_device_event, }; static struct notifier_block clip_inet_notifier = { .notifier_call = clip_inet_event, }; static void atmarpd_close(struct atm_vcc *vcc) { pr_debug("\n"); rtnl_lock(); atmarpd = NULL; skb_queue_purge(&sk_atm(vcc)->sk_receive_queue); rtnl_unlock(); pr_debug("(done)\n"); module_put(THIS_MODULE); } static const struct atmdev_ops atmarpd_dev_ops = { .close = atmarpd_close }; static struct atm_dev atmarpd_dev = { .ops = &atmarpd_dev_ops, .type = "arpd", .number = 999, .lock = __SPIN_LOCK_UNLOCKED(atmarpd_dev.lock) }; static int atm_init_atmarp(struct atm_vcc *vcc) { rtnl_lock(); if (atmarpd) { rtnl_unlock(); return -EADDRINUSE; } mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); atmarpd = vcc; set_bit(ATM_VF_META, &vcc->flags); set_bit(ATM_VF_READY, &vcc->flags); /* allow replies and avoid getting closed if signaling dies */ vcc->dev = &atmarpd_dev; vcc_insert_socket(sk_atm(vcc)); vcc->push = NULL; vcc->pop = NULL; /* crash */ vcc->push_oam = NULL; /* crash */ rtnl_unlock(); return 0; } static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct atm_vcc *vcc = ATM_SD(sock); int err = 0; switch (cmd) { case SIOCMKCLIP: case ATMARPD_CTRL: case ATMARP_MKIP: case ATMARP_SETENTRY: case ATMARP_ENCAP: if (!capable(CAP_NET_ADMIN)) return -EPERM; break; default: return -ENOIOCTLCMD; } switch (cmd) { case SIOCMKCLIP: err = clip_create(arg); break; case ATMARPD_CTRL: err = atm_init_atmarp(vcc); if (!err) { sock->state = SS_CONNECTED; __module_get(THIS_MODULE); } break; case ATMARP_MKIP: err = clip_mkip(vcc, arg); break; case ATMARP_SETENTRY: err = clip_setentry(vcc, (__force __be32)arg); break; case ATMARP_ENCAP: err = clip_encap(vcc, arg); break; } return err; } static struct atm_ioctl clip_ioctl_ops = { .owner = THIS_MODULE, .ioctl = clip_ioctl, }; #ifdef CONFIG_PROC_FS static void svc_addr(struct seq_file *seq, struct sockaddr_atmsvc *addr) { static int code[] = { 1, 2, 10, 6, 1, 0 }; static int e164[] = { 1, 8, 4, 6, 1, 0 }; if (*addr->sas_addr.pub) { seq_printf(seq, "%s", addr->sas_addr.pub); if (*addr->sas_addr.prv) seq_putc(seq, '+'); } else if (!*addr->sas_addr.prv) { seq_printf(seq, "%s", "(none)"); return; } if (*addr->sas_addr.prv) { unsigned char *prv = addr->sas_addr.prv; int *fields; int i, j; fields = *prv == ATM_AFI_E164 ? e164 : code; for (i = 0; fields[i]; i++) { for (j = fields[i]; j; j--) seq_printf(seq, "%02X", *prv++); if (fields[i + 1]) seq_putc(seq, '.'); } } } /* This means the neighbour entry has no attached VCC objects. */ #define SEQ_NO_VCC_TOKEN ((void *) 2) static void atmarp_info(struct seq_file *seq, struct neighbour *n, struct atmarp_entry *entry, struct clip_vcc *clip_vcc) { struct net_device *dev = n->dev; unsigned long exp; char buf[17]; int svc, llc, off; svc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || (sk_atm(clip_vcc->vcc)->sk_family == AF_ATMSVC)); llc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || clip_vcc->encap); if (clip_vcc == SEQ_NO_VCC_TOKEN) exp = entry->neigh->used; else exp = clip_vcc->last_use; exp = (jiffies - exp) / HZ; seq_printf(seq, "%-6s%-4s%-4s%5ld ", dev->name, svc ? "SVC" : "PVC", llc ? "LLC" : "NULL", exp); off = scnprintf(buf, sizeof(buf) - 1, "%pI4", n->primary_key); while (off < 16) buf[off++] = ' '; buf[off] = '\0'; seq_printf(seq, "%s", buf); if (clip_vcc == SEQ_NO_VCC_TOKEN) { if (time_before(jiffies, entry->expires)) seq_printf(seq, "(resolving)\n"); else seq_printf(seq, "(expired, ref %d)\n", refcount_read(&entry->neigh->refcnt)); } else if (!svc) { seq_printf(seq, "%d.%d.%d\n", clip_vcc->vcc->dev->number, clip_vcc->vcc->vpi, clip_vcc->vcc->vci); } else { svc_addr(seq, &clip_vcc->vcc->remote); seq_putc(seq, '\n'); } } struct clip_seq_state { /* This member must be first. */ struct neigh_seq_state ns; /* Local to clip specific iteration. */ struct clip_vcc *vcc; }; static struct clip_vcc *clip_seq_next_vcc(struct atmarp_entry *e, struct clip_vcc *curr) { if (!curr) { curr = e->vccs; if (!curr) return SEQ_NO_VCC_TOKEN; return curr; } if (curr == SEQ_NO_VCC_TOKEN) return NULL; curr = curr->next; return curr; } static void *clip_seq_vcc_walk(struct clip_seq_state *state, struct atmarp_entry *e, loff_t * pos) { struct clip_vcc *vcc = state->vcc; vcc = clip_seq_next_vcc(e, vcc); if (vcc && pos != NULL) { while (*pos) { vcc = clip_seq_next_vcc(e, vcc); if (!vcc) break; --(*pos); } } state->vcc = vcc; return vcc; } static void *clip_seq_sub_iter(struct neigh_seq_state *_state, struct neighbour *n, loff_t * pos) { struct clip_seq_state *state = (struct clip_seq_state *)_state; if (n->dev->type != ARPHRD_ATM) return NULL; return clip_seq_vcc_walk(state, neighbour_priv(n), pos); } static void *clip_seq_start(struct seq_file *seq, loff_t * pos) { struct clip_seq_state *state = seq->private; state->ns.neigh_sub_iter = clip_seq_sub_iter; return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_NEIGH_ONLY); } static int clip_seq_show(struct seq_file *seq, void *v) { static char atm_arp_banner[] = "IPitf TypeEncp Idle IP address ATM address\n"; if (v == SEQ_START_TOKEN) { seq_puts(seq, atm_arp_banner); } else { struct clip_seq_state *state = seq->private; struct clip_vcc *vcc = state->vcc; struct neighbour *n = v; atmarp_info(seq, n, neighbour_priv(n), vcc); } return 0; } static const struct seq_operations arp_seq_ops = { .start = clip_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, .show = clip_seq_show, }; #endif static void atm_clip_exit_noproc(void); static int __init atm_clip_init(void) { register_atm_ioctl(&clip_ioctl_ops); register_netdevice_notifier(&clip_dev_notifier); register_inetaddr_notifier(&clip_inet_notifier); timer_setup(&idle_timer, idle_timer_check, 0); #ifdef CONFIG_PROC_FS { struct proc_dir_entry *p; p = proc_create_net("arp", 0444, atm_proc_root, &arp_seq_ops, sizeof(struct clip_seq_state)); if (!p) { pr_err("Unable to initialize /proc/net/atm/arp\n"); atm_clip_exit_noproc(); return -ENOMEM; } } #endif return 0; } static void atm_clip_exit_noproc(void) { struct net_device *dev, *next; unregister_inetaddr_notifier(&clip_inet_notifier); unregister_netdevice_notifier(&clip_dev_notifier); deregister_atm_ioctl(&clip_ioctl_ops); /* First, stop the idle timer, so it stops banging * on the table. */ del_timer_sync(&idle_timer); dev = clip_devs; while (dev) { next = PRIV(dev)->next; unregister_netdev(dev); free_netdev(dev); dev = next; } } static void __exit atm_clip_exit(void) { remove_proc_entry("arp", atm_proc_root); atm_clip_exit_noproc(); } module_init(atm_clip_init); module_exit(atm_clip_exit); MODULE_AUTHOR("Werner Almesberger"); MODULE_DESCRIPTION("Classical/IP over ATM interface"); MODULE_LICENSE("GPL");
40 40 2 1 847 847 847 847 847 5 13 13 8 1 1 1 1 1 1 1 22 22 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/workqueue.h> #include <linux/rtnetlink.h> #include <linux/cache.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/delay.h> #include <linux/sched.h> #include <linux/idr.h> #include <linux/rculist.h> #include <linux/nsproxy.h> #include <linux/fs.h> #include <linux/proc_ns.h> #include <linux/file.h> #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/net_namespace.h> #include <linux/sched/task.h> #include <linux/uidgid.h> #include <linux/cookie.h> #include <linux/proc_fs.h> #include <net/sock.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> /* * Our network namespace constructor/destructor lists */ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); /* Protects net_namespace_list. Nests iside rtnl_lock() */ DECLARE_RWSEM(net_rwsem); EXPORT_SYMBOL_GPL(net_rwsem); #ifdef CONFIG_KEYS static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) }; #endif struct net init_net; EXPORT_SYMBOL(init_net); static bool init_net_initialized; /* * pernet_ops_rwsem: protects: pernet_list, net_generic_ids, * init_net_initialized and first_device pointer. * This is internal net namespace object. Please, don't use it * outside. */ DECLARE_RWSEM(pernet_ops_rwsem); EXPORT_SYMBOL_GPL(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; DEFINE_COOKIE(net_cookie); static struct net_generic *net_alloc_generic(void) { struct net_generic *ng; unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); ng = kzalloc(generic_size, GFP_KERNEL); if (ng) ng->s.len = max_gen_ptrs; return ng; } static int net_assign_generic(struct net *net, unsigned int id, void *data) { struct net_generic *ng, *old_ng; BUG_ON(id < MIN_PERNET_OPS_ID); old_ng = rcu_dereference_protected(net->gen, lockdep_is_held(&pernet_ops_rwsem)); if (old_ng->s.len > id) { old_ng->ptr[id] = data; return 0; } ng = net_alloc_generic(); if (!ng) return -ENOMEM; /* * Some synchronisation notes: * * The net_generic explores the net->gen array inside rcu * read section. Besides once set the net->gen->ptr[x] * pointer never changes (see rules in netns/generic.h). * * That said, we simply duplicate this array and schedule * the old copy for kfree after a grace period. */ memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID], (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *)); ng->ptr[id] = data; rcu_assign_pointer(net->gen, ng); kfree_rcu(old_ng, s.rcu); return 0; } static int ops_init(const struct pernet_operations *ops, struct net *net) { struct net_generic *ng; int err = -ENOMEM; void *data = NULL; if (ops->id && ops->size) { data = kzalloc(ops->size, GFP_KERNEL); if (!data) goto out; err = net_assign_generic(net, *ops->id, data); if (err) goto cleanup; } err = 0; if (ops->init) err = ops->init(net); if (!err) return 0; if (ops->id && ops->size) { ng = rcu_dereference_protected(net->gen, lockdep_is_held(&pernet_ops_rwsem)); ng->ptr[*ops->id] = NULL; } cleanup: kfree(data); out: return err; } static void ops_pre_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; if (ops->pre_exit) { list_for_each_entry(net, net_exit_list, exit_list) ops->pre_exit(net); } } static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; if (ops->exit) { list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); cond_resched(); } } if (ops->exit_batch) ops->exit_batch(net_exit_list); } static void ops_free_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; if (ops->size && ops->id) { list_for_each_entry(net, net_exit_list, exit_list) kfree(net_generic(net, *ops->id)); } } /* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { int min = 0, max = 0; if (reqid >= 0) { min = reqid; max = reqid + 1; } return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC); } /* This function is used by idr_for_each(). If net is equal to peer, the * function returns the id so that idr_for_each() stops. Because we cannot * returns the id 0 (idr_for_each() will not stop), we return the magic value * NET_ID_ZERO (-1) for it. */ #define NET_ID_ZERO -1 static int net_eq_idr(int id, void *net, void *peer) { if (net_eq(net, peer)) return id ? : NET_ID_ZERO; return 0; } /* Must be called from RCU-critical section or with nsid_lock held */ static int __peernet2id(const struct net *net, struct net *peer) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); /* Magic value for id 0. */ if (id == NET_ID_ZERO) return 0; if (id > 0) return id; return NETNSA_NSID_NOT_ASSIGNED; } static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, struct nlmsghdr *nlh, gfp_t gfp); /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { int id; if (refcount_read(&net->ns.count) == 0) return NETNSA_NSID_NOT_ASSIGNED; spin_lock_bh(&net->nsid_lock); id = __peernet2id(net, peer); if (id >= 0) { spin_unlock_bh(&net->nsid_lock); return id; } /* When peer is obtained from RCU lists, we may race with * its cleanup. Check whether it's alive, and this guarantees * we never hash a peer back to net->netns_ids, after it has * just been idr_remove()'d from there in cleanup_net(). */ if (!maybe_get_net(peer)) { spin_unlock_bh(&net->nsid_lock); return NETNSA_NSID_NOT_ASSIGNED; } id = alloc_netid(net, peer, -1); spin_unlock_bh(&net->nsid_lock); put_net(peer); if (id < 0) return NETNSA_NSID_NOT_ASSIGNED; rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp); return id; } EXPORT_SYMBOL_GPL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ int peernet2id(const struct net *net, struct net *peer) { int id; rcu_read_lock(); id = __peernet2id(net, peer); rcu_read_unlock(); return id; } EXPORT_SYMBOL(peernet2id); /* This function returns true is the peer netns has an id assigned into the * current netns. */ bool peernet_has_id(const struct net *net, struct net *peer) { return peernet2id(net, peer) >= 0; } struct net *get_net_ns_by_id(const struct net *net, int id) { struct net *peer; if (id < 0) return NULL; rcu_read_lock(); peer = idr_find(&net->netns_ids, id); if (peer) peer = maybe_get_net(peer); rcu_read_unlock(); return peer; } EXPORT_SYMBOL_GPL(get_net_ns_by_id); /* init code that must occur even if setup_net() is not called. */ static __net_init void preinit_net(struct net *net) { ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt"); } /* * setup_net runs the initializers for the network namespace object. */ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) { /* Must be called with pernet_ops_rwsem held */ const struct pernet_operations *ops, *saved_ops; int error = 0; LIST_HEAD(net_exit_list); refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt"); refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); preempt_disable(); net->net_cookie = gen_cookie_next(&net_cookie); preempt_enable(); net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); spin_lock_init(&net->nsid_lock); mutex_init(&net->ipv4.ra_mutex); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); if (error < 0) goto out_undo; } down_write(&net_rwsem); list_add_tail_rcu(&net->list, &net_namespace_list); up_write(&net_rwsem); out: return error; out_undo: /* Walk through the list backwards calling the exit functions * for the pernet modules whose init functions did not fail. */ list_add(&net->exit_list, &net_exit_list); saved_ops = ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) ops_pre_exit_list(ops, &net_exit_list); synchronize_rcu(); ops = saved_ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list); ops = saved_ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list); rcu_barrier(); goto out; } static int __net_init net_defaults_init_net(struct net *net) { net->core.sysctl_somaxconn = SOMAXCONN; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; return 0; } static struct pernet_operations net_defaults_ops = { .init = net_defaults_init_net, }; static __init int net_defaults_init(void) { if (register_pernet_subsys(&net_defaults_ops)) panic("Cannot initialize net default settings"); return 0; } core_initcall(net_defaults_init); #ifdef CONFIG_NET_NS static struct ucounts *inc_net_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES); } static void dec_net_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_NET_NAMESPACES); } static struct kmem_cache *net_cachep __ro_after_init; static struct workqueue_struct *netns_wq; static struct net *net_alloc(void) { struct net *net = NULL; struct net_generic *ng; ng = net_alloc_generic(); if (!ng) goto out; net = kmem_cache_zalloc(net_cachep, GFP_KERNEL); if (!net) goto out_free; #ifdef CONFIG_KEYS net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL); if (!net->key_domain) goto out_free_2; refcount_set(&net->key_domain->usage, 1); #endif rcu_assign_pointer(net->gen, ng); out: return net; #ifdef CONFIG_KEYS out_free_2: kmem_cache_free(net_cachep, net); net = NULL; #endif out_free: kfree(ng); goto out; } static void net_free(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); /* There should not be any trackers left there. */ ref_tracker_dir_exit(&net->notrefcnt_tracker); kmem_cache_free(net_cachep, net); } } void net_drop_ns(void *p) { struct net *net = (struct net *)p; if (net) net_free(net); } struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net) { struct ucounts *ucounts; struct net *net; int rv; if (!(flags & CLONE_NEWNET)) return get_net(old_net); ucounts = inc_net_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); net = net_alloc(); if (!net) { rv = -ENOMEM; goto dec_ucounts; } preinit_net(net); refcount_set(&net->passive, 1); net->ucounts = ucounts; get_user_ns(user_ns); rv = down_read_killable(&pernet_ops_rwsem); if (rv < 0) goto put_userns; rv = setup_net(net, user_ns); up_read(&pernet_ops_rwsem); if (rv < 0) { put_userns: #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif put_user_ns(user_ns); net_free(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); } return net; } /** * net_ns_get_ownership - get sysfs ownership data for @net * @net: network namespace in question (can be NULL) * @uid: kernel user ID for sysfs objects * @gid: kernel group ID for sysfs objects * * Returns the uid/gid pair of root in the user namespace associated with the * given network namespace. */ void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) { if (net) { kuid_t ns_root_uid = make_kuid(net->user_ns, 0); kgid_t ns_root_gid = make_kgid(net->user_ns, 0); if (uid_valid(ns_root_uid)) *uid = ns_root_uid; if (gid_valid(ns_root_gid)) *gid = ns_root_gid; } else { *uid = GLOBAL_ROOT_UID; *gid = GLOBAL_ROOT_GID; } } EXPORT_SYMBOL_GPL(net_ns_get_ownership); static void unhash_nsid(struct net *net, struct net *last) { struct net *tmp; /* This function is only called from cleanup_net() work, * and this work is the only process, that may delete * a net from net_namespace_list. So, when the below * is executing, the list may only grow. Thus, we do not * use for_each_net_rcu() or net_rwsem. */ for_each_net(tmp) { int id; spin_lock_bh(&tmp->nsid_lock); id = __peernet2id(tmp, net); if (id >= 0) idr_remove(&tmp->netns_ids, id); spin_unlock_bh(&tmp->nsid_lock); if (id >= 0) rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, GFP_KERNEL); if (tmp == last) break; } spin_lock_bh(&net->nsid_lock); idr_destroy(&net->netns_ids); spin_unlock_bh(&net->nsid_lock); } static LLIST_HEAD(cleanup_list); static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; struct net *net, *tmp, *last; struct llist_node *net_kill_list; LIST_HEAD(net_exit_list); /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); down_read(&pernet_ops_rwsem); /* Don't let anyone else find us. */ down_write(&net_rwsem); llist_for_each_entry(net, net_kill_list, cleanup_list) list_del_rcu(&net->list); /* Cache last net. After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer * to a net from net_kill_list (see peernet2id_alloc()). * So, we skip them in unhash_nsid(). * * Note, that unhash_nsid() does not delete nsid links * between net_kill_list's nets, as they've already * deleted from net_namespace_list. But, this would be * useless anyway, as netns_ids are destroyed there. */ last = list_last_entry(&net_namespace_list, struct net, list); up_write(&net_rwsem); llist_for_each_entry(net, net_kill_list, cleanup_list) { unhash_nsid(net, last); list_add_tail(&net->exit_list, &net_exit_list); } /* Run all of the network namespace pre_exit methods */ list_for_each_entry_reverse(ops, &pernet_list, list) ops_pre_exit_list(ops, &net_exit_list); /* * Another CPU might be rcu-iterating the list, wait for it. * This needs to be before calling the exit() notifiers, so * the rcu_barrier() below isn't sufficient alone. * Also the pre_exit() and exit() methods need this barrier. */ synchronize_rcu(); /* Run all of the network namespace exit methods */ list_for_each_entry_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list); /* Free the net generic variables */ list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list); up_read(&pernet_ops_rwsem); /* Ensure there are no outstanding rcu callbacks using this * network namespace. */ rcu_barrier(); /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif put_user_ns(net->user_ns); net_free(net); } } /** * net_ns_barrier - wait until concurrent net_cleanup_work is done * * cleanup_net runs from work queue and will first remove namespaces * from the global list, then run net exit functions. * * Call this in module exit path to make sure that all netns * ->exit ops have been invoked before the function is removed. */ void net_ns_barrier(void) { down_write(&pernet_ops_rwsem); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL(net_ns_barrier); static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { ref_tracker_dir_exit(&net->refcnt_tracker); /* Cleanup the network namespace in process context */ if (llist_add(&net->cleanup_list, &cleanup_list)) queue_work(netns_wq, &net_cleanup_work); } EXPORT_SYMBOL_GPL(__put_net); /** * get_net_ns - increment the refcount of the network namespace * @ns: common namespace (net) * * Returns the net's common namespace. */ struct ns_common *get_net_ns(struct ns_common *ns) { return &get_net(container_of(ns, struct net, ns))->ns; } EXPORT_SYMBOL_GPL(get_net_ns); struct net *get_net_ns_by_fd(int fd) { struct fd f = fdget(fd); struct net *net = ERR_PTR(-EINVAL); if (!f.file) return ERR_PTR(-EBADF); if (proc_ns_file(f.file)) { struct ns_common *ns = get_proc_ns(file_inode(f.file)); if (ns->ops == &netns_operations) net = get_net(container_of(ns, struct net, ns)); } fdput(f); return net; } EXPORT_SYMBOL_GPL(get_net_ns_by_fd); #endif struct net *get_net_ns_by_pid(pid_t pid) { struct task_struct *tsk; struct net *net; /* Lookup the network namespace */ net = ERR_PTR(-ESRCH); rcu_read_lock(); tsk = find_task_by_vpid(pid); if (tsk) { struct nsproxy *nsproxy; task_lock(tsk); nsproxy = tsk->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); task_unlock(tsk); } rcu_read_unlock(); return net; } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); static __net_init int net_ns_net_init(struct net *net) { #ifdef CONFIG_NET_NS net->ns.ops = &netns_operations; #endif return ns_alloc_inum(&net->ns); } static __net_exit void net_ns_net_exit(struct net *net) { ns_free_inum(&net->ns); } static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, .exit = net_ns_net_exit, }; static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_NONE] = { .type = NLA_UNSPEC }, [NETNSA_NSID] = { .type = NLA_S32 }, [NETNSA_PID] = { .type = NLA_U32 }, [NETNSA_FD] = { .type = NLA_U32 }, [NETNSA_TARGET_NSID] = { .type = NLA_S32 }, }; static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; struct nlattr *nla; struct net *peer; int nsid, err; err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); if (err < 0) return err; if (!tb[NETNSA_NSID]) { NL_SET_ERR_MSG(extack, "nsid is missing"); return -EINVAL; } nsid = nla_get_s32(tb[NETNSA_NSID]); if (tb[NETNSA_PID]) { peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); nla = tb[NETNSA_PID]; } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; } if (IS_ERR(peer)) { NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); return PTR_ERR(peer); } spin_lock_bh(&net->nsid_lock); if (__peernet2id(net, peer) >= 0) { spin_unlock_bh(&net->nsid_lock); err = -EEXIST; NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, "Peer netns already has a nsid assigned"); goto out; } err = alloc_netid(net, peer, nsid); spin_unlock_bh(&net->nsid_lock); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, nlh, GFP_KERNEL); err = 0; } else if (err == -ENOSPC && nsid >= 0) { err = -EEXIST; NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]); NL_SET_ERR_MSG(extack, "The specified nsid is already used"); } out: put_net(peer); return err; } static int rtnl_net_get_size(void) { return NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(sizeof(s32)) /* NETNSA_NSID */ + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */ ; } struct net_fill_args { u32 portid; u32 seq; int flags; int cmd; int nsid; bool add_ref; int ref_nsid; }; static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args) { struct nlmsghdr *nlh; struct rtgenmsg *rth; nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth), args->flags); if (!nlh) return -EMSGSIZE; rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; if (nla_put_s32(skb, NETNSA_NSID, args->nsid)) goto nla_put_failure; if (args->add_ref && nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int rtnl_net_valid_getid_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); if (err) return err; for (i = 0; i <= NETNSA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETNSA_PID: case NETNSA_FD: case NETNSA_NSID: case NETNSA_TARGET_NSID: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request"); return -EINVAL; } } return 0; } static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; struct net_fill_args fillargs = { .portid = NETLINK_CB(skb).portid, .seq = nlh->nlmsg_seq, .cmd = RTM_NEWNSID, }; struct net *peer, *target = net; struct nlattr *nla; struct sk_buff *msg; int err; err = rtnl_net_valid_getid_req(skb, nlh, tb, extack); if (err < 0) return err; if (tb[NETNSA_PID]) { peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); nla = tb[NETNSA_PID]; } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); nla = tb[NETNSA_FD]; } else if (tb[NETNSA_NSID]) { peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID])); if (!peer) peer = ERR_PTR(-ENOENT); nla = tb[NETNSA_NSID]; } else { NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; } if (IS_ERR(peer)) { NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); return PTR_ERR(peer); } if (tb[NETNSA_TARGET_NSID]) { int id = nla_get_s32(tb[NETNSA_TARGET_NSID]); target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id); if (IS_ERR(target)) { NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]); NL_SET_ERR_MSG(extack, "Target netns reference is invalid"); err = PTR_ERR(target); goto out; } fillargs.add_ref = true; fillargs.ref_nsid = peernet2id(net, peer); } msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); if (!msg) { err = -ENOMEM; goto out; } fillargs.nsid = peernet2id(target, peer); err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid); goto out; err_out: nlmsg_free(msg); out: if (fillargs.add_ref) put_net(target); put_net(peer); return err; } struct rtnl_net_dump_cb { struct net *tgt_net; struct net *ref_net; struct sk_buff *skb; struct net_fill_args fillargs; int idx; int s_idx; }; /* Runs in RCU-critical section. */ static int rtnl_net_dumpid_one(int id, void *peer, void *data) { struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data; int ret; if (net_cb->idx < net_cb->s_idx) goto cont; net_cb->fillargs.nsid = id; if (net_cb->fillargs.add_ref) net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer); ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs); if (ret < 0) return ret; cont: net_cb->idx++; return 0; } static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk, struct rtnl_net_dump_cb *net_cb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[NETNSA_MAX + 1]; int err, i; err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy, extack); if (err < 0) return err; for (i = 0; i <= NETNSA_MAX; i++) { if (!tb[i]) continue; if (i == NETNSA_TARGET_NSID) { struct net *net; net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i])); if (IS_ERR(net)) { NL_SET_BAD_ATTR(extack, tb[i]); NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); return PTR_ERR(net); } net_cb->fillargs.add_ref = true; net_cb->ref_net = net_cb->tgt_net; net_cb->tgt_net = net; } else { NL_SET_BAD_ATTR(extack, tb[i]); NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; } } return 0; } static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { struct rtnl_net_dump_cb net_cb = { .tgt_net = sock_net(skb->sk), .skb = skb, .fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = cb->nlh->nlmsg_seq, .flags = NLM_F_MULTI, .cmd = RTM_NEWNSID, }, .idx = 0, .s_idx = cb->args[0], }; int err = 0; if (cb->strict_check) { err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb); if (err < 0) goto end; } rcu_read_lock(); idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb); rcu_read_unlock(); cb->args[0] = net_cb.idx; end: if (net_cb.fillargs.add_ref) put_net(net_cb.tgt_net); return err < 0 ? err : skb->len; } static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid, struct nlmsghdr *nlh, gfp_t gfp) { struct net_fill_args fillargs = { .portid = portid, .seq = nlh ? nlh->nlmsg_seq : 0, .cmd = cmd, .nsid = id, }; struct sk_buff *msg; int err = -ENOMEM; msg = nlmsg_new(rtnl_net_get_size(), gfp); if (!msg) goto out; err = rtnl_net_fill(msg, &fillargs); if (err < 0) goto err_out; rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp); return; err_out: nlmsg_free(msg); out: rtnl_set_sk_err(net, RTNLGRP_NSID, err); } void __init net_ns_init(void) { struct net_generic *ng; #ifdef CONFIG_NET_NS net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Create workqueue for cleanup */ netns_wq = create_singlethread_workqueue("netns"); if (!netns_wq) panic("Could not create netns workq"); #endif ng = net_alloc_generic(); if (!ng) panic("Could not allocate generic netns"); rcu_assign_pointer(init_net.gen, ng); #ifdef CONFIG_KEYS init_net.key_domain = &init_net_key_domain; #endif down_write(&pernet_ops_rwsem); preinit_net(&init_net); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); init_net_initialized = true; up_write(&pernet_ops_rwsem); if (register_pernet_subsys(&net_ns_ops)) panic("Could not register network namespace subsystems"); rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, RTNL_FLAG_DOIT_UNLOCKED); } static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) { ops_pre_exit_list(ops, net_exit_list); synchronize_rcu(); ops_exit_list(ops, net_exit_list); ops_free_list(ops, net_exit_list); } #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { struct net *net; int error; LIST_HEAD(net_exit_list); list_add_tail(&ops->list, list); if (ops->init || (ops->id && ops->size)) { /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ for_each_net(net) { error = ops_init(ops, net); if (error) goto out_undo; list_add_tail(&net->exit_list, &net_exit_list); } } return 0; out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); free_exit_list(ops, &net_exit_list); return error; } static void __unregister_pernet_operations(struct pernet_operations *ops) { struct net *net; LIST_HEAD(net_exit_list); list_del(&ops->list); /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); free_exit_list(ops, &net_exit_list); } #else static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { if (!init_net_initialized) { list_add_tail(&ops->list, list); return 0; } return ops_init(ops, &init_net); } static void __unregister_pernet_operations(struct pernet_operations *ops) { if (!init_net_initialized) { list_del(&ops->list); } else { LIST_HEAD(net_exit_list); list_add(&init_net.exit_list, &net_exit_list); free_exit_list(ops, &net_exit_list); } } #endif /* CONFIG_NET_NS */ static DEFINE_IDA(net_generic_ids); static int register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { int error; if (ops->id) { error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID, GFP_KERNEL); if (error < 0) return error; *ops->id = error; max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1); } error = __register_pernet_operations(list, ops); if (error) { rcu_barrier(); if (ops->id) ida_free(&net_generic_ids, *ops->id); } return error; } static void unregister_pernet_operations(struct pernet_operations *ops) { __unregister_pernet_operations(ops); rcu_barrier(); if (ops->id) ida_free(&net_generic_ids, *ops->id); } /** * register_pernet_subsys - register a network namespace subsystem * @ops: pernet operations structure for the subsystem * * Register a subsystem which has init and exit functions * that are called when network namespaces are created and * destroyed respectively. * * When registered all network namespace init functions are * called for every existing network namespace. Allowing kernel * modules to have a race free view of the set of network namespaces. * * When a new network namespace is created all of the init * methods are called in the order in which they were registered. * * When a network namespace is destroyed all of the exit methods * are called in the reverse of the order with which they were * registered. */ int register_pernet_subsys(struct pernet_operations *ops) { int error; down_write(&pernet_ops_rwsem); error = register_pernet_operations(first_device, ops); up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_subsys); /** * unregister_pernet_subsys - unregister a network namespace subsystem * @ops: pernet operations structure to manipulate * * Remove the pernet operations structure from the list to be * used when network namespaces are created or destroyed. In * addition run the exit method for all existing network * namespaces. */ void unregister_pernet_subsys(struct pernet_operations *ops) { down_write(&pernet_ops_rwsem); unregister_pernet_operations(ops); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_subsys); /** * register_pernet_device - register a network namespace device * @ops: pernet operations structure for the subsystem * * Register a device which has init and exit functions * that are called when network namespaces are created and * destroyed respectively. * * When registered all network namespace init functions are * called for every existing network namespace. Allowing kernel * modules to have a race free view of the set of network namespaces. * * When a new network namespace is created all of the init * methods are called in the order in which they were registered. * * When a network namespace is destroyed all of the exit methods * are called in the reverse of the order with which they were * registered. */ int register_pernet_device(struct pernet_operations *ops) { int error; down_write(&pernet_ops_rwsem); error = register_pernet_operations(&pernet_list, ops); if (!error && (first_device == &pernet_list)) first_device = &ops->list; up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_device); /** * unregister_pernet_device - unregister a network namespace netdevice * @ops: pernet operations structure to manipulate * * Remove the pernet operations structure from the list to be * used when network namespaces are created or destroyed. In * addition run the exit method for all existing network * namespaces. */ void unregister_pernet_device(struct pernet_operations *ops) { down_write(&pernet_ops_rwsem); if (&ops->list == first_device) first_device = first_device->next; unregister_pernet_operations(ops); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_device); #ifdef CONFIG_NET_NS static struct ns_common *netns_get(struct task_struct *task) { struct net *net = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); task_unlock(task); return net ? &net->ns : NULL; } static inline struct net *to_net_ns(struct ns_common *ns) { return container_of(ns, struct net, ns); } static void netns_put(struct ns_common *ns) { put_net(to_net_ns(ns)); } static int netns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct net *net = to_net_ns(ns); if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; put_net(nsproxy->net_ns); nsproxy->net_ns = get_net(net); return 0; } static struct user_namespace *netns_owner(struct ns_common *ns) { return to_net_ns(ns)->user_ns; } const struct proc_ns_operations netns_operations = { .name = "net", .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, .owner = netns_owner, }; #endif
2092 2092 34 5526 234 177 177 22 5117 4319 16 82 4378 4935 2040 1 34 2080 2076 2038 2080 26 2063 2063 4942 4943 4943 34 2562 215 2568 4946 2080 5283 2644 4934 4005 3827 1061 1061 1061 1061 4 189 948 4133 2477 354 113 831 830 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 // SPDX-License-Identifier: GPL-2.0 /* * ext4.h * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/include/linux/minix_fs.h * * Copyright (C) 1991, 1992 Linus Torvalds */ #ifndef _EXT4_H #define _EXT4_H #include <linux/refcount.h> #include <linux/types.h> #include <linux/blkdev.h> #include <linux/magic.h> #include <linux/jbd2.h> #include <linux/quota.h> #include <linux/rwsem.h> #include <linux/rbtree.h> #include <linux/seqlock.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/wait.h> #include <linux/sched/signal.h> #include <linux/blockgroup_lock.h> #include <linux/percpu_counter.h> #include <linux/ratelimit.h> #include <crypto/hash.h> #include <linux/falloc.h> #include <linux/percpu-rwsem.h> #include <linux/fiemap.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif #include <uapi/linux/ext4.h> #include <linux/fscrypt.h> #include <linux/fsverity.h> #include <linux/compiler.h> /* * The fourth extended filesystem constants/structures */ /* * with AGGRESSIVE_CHECK allocator runs consistency checks over * structures. these checks slow things down a lot */ #define AGGRESSIVE_CHECK__ /* * with DOUBLE_CHECK defined mballoc creates persistent in-core * bitmaps, maintains and uses them to check for double allocations */ #define DOUBLE_CHECK__ /* * Define EXT4FS_DEBUG to produce debug messages */ #undef EXT4FS_DEBUG /* * Debug code */ #ifdef EXT4FS_DEBUG #define ext4_debug(f, a...) \ do { \ printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ __FILE__, __LINE__, __func__); \ printk(KERN_DEBUG f, ## a); \ } while (0) #else #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c */ #define EXT_DEBUG__ /* * Dynamic printk for controlled extents debugging. */ #ifdef CONFIG_EXT4_DEBUG #define ext_debug(ino, fmt, ...) \ pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt, \ current->comm, task_pid_nr(current), \ ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \ __func__, ##__VA_ARGS__) #else #define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define ASSERT(assert) \ do { \ if (unlikely(!(assert))) { \ printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: '%s'\n", \ __func__, __FILE__, __LINE__, #assert); \ BUG(); \ } \ } while (0) /* data type for block offset of block group */ typedef int ext4_grpblk_t; /* data type for filesystem-wide blocks number */ typedef unsigned long long ext4_fsblk_t; /* data type for file logical block number */ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; enum SHIFT_DIRECTION { SHIFT_LEFT = 0, SHIFT_RIGHT, }; /* * For each criteria, mballoc has slightly different way of finding * the required blocks nad usually, higher the criteria the slower the * allocation. We start at lower criterias and keep falling back to * higher ones if we are not able to find any blocks. Lower (earlier) * criteria are faster. */ enum criteria { /* * Used when number of blocks needed is a power of 2. This * doesn't trigger any disk IO except prefetch and is the * fastest criteria. */ CR_POWER2_ALIGNED, /* * Tries to lookup in-memory data structures to find the most * suitable group that satisfies goal request. No disk IO * except block prefetch. */ CR_GOAL_LEN_FAST, /* * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal * length to the best available length for faster allocation. */ CR_BEST_AVAIL_LEN, /* * Reads each block group sequentially, performing disk IO if * necessary, to find find_suitable block group. Tries to * allocate goal length but might trim the request if nothing * is found after enough tries. */ CR_GOAL_LEN_SLOW, /* * Finds the first free set of blocks and allocates * those. This is only used in rare cases when * CR_GOAL_LEN_SLOW also fails to allocate anything. */ CR_ANY_FREE, /* * Number of criterias defined. */ EXT4_MB_NUM_CRS }; /* * Flags used in mballoc's allocation_context flags field. * * Also used to show what's going on for debugging purposes when the * flag field is exported via the traceport interface */ /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 0x0001 /* blocks already reserved */ #define EXT4_MB_HINT_RESERVED 0x0002 /* metadata is being allocated */ #define EXT4_MB_HINT_METADATA 0x0004 /* first blocks in the file */ #define EXT4_MB_HINT_FIRST 0x0008 /* search for the best chunk */ #define EXT4_MB_HINT_BEST 0x0010 /* data is being allocated */ #define EXT4_MB_HINT_DATA 0x0020 /* don't preallocate (for tails) */ #define EXT4_MB_HINT_NOPREALLOC 0x0040 /* allocate for locality group */ #define EXT4_MB_HINT_GROUP_ALLOC 0x0080 /* allocate goal blocks or none */ #define EXT4_MB_HINT_GOAL_ONLY 0x0100 /* goal is meaningful */ #define EXT4_MB_HINT_TRY_GOAL 0x0200 /* blocks already pre-reserved by delayed allocation */ #define EXT4_MB_DELALLOC_RESERVED 0x0400 /* We are doing stream allocation */ #define EXT4_MB_STREAM_ALLOC 0x0800 /* Use reserved root blocks if needed */ #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 /* Use blocks from reserved pool */ #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 /* Large fragment size list lookup succeeded at least once for cr = 0 */ #define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ #define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ #define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; /* how many blocks we want to allocate */ unsigned int len; /* logical block in target inode */ ext4_lblk_t logical; /* the closest logical allocated block to the left */ ext4_lblk_t lleft; /* the closest logical allocated block to the right */ ext4_lblk_t lright; /* phys. target (a hint) */ ext4_fsblk_t goal; /* phys. block for the closest logical allocated block to the left */ ext4_fsblk_t pleft; /* phys. block for the closest logical allocated block to the right */ ext4_fsblk_t pright; /* flags. see above EXT4_MB_HINT_* */ unsigned int flags; }; /* * Logical to physical block mapping, used by ext4_map_blocks() * * This structure is used to pass requests into ext4_map_blocks() as * well as to store the information returned by ext4_map_blocks(). It * takes less room on the stack than a struct buffer_head. */ #define EXT4_MAP_NEW BIT(BH_New) #define EXT4_MAP_MAPPED BIT(BH_Mapped) #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) struct ext4_map_blocks { ext4_fsblk_t m_pblk; ext4_lblk_t m_lblk; unsigned int m_len; unsigned int m_flags; }; /* * Block validity checking, system zone rbtree. */ struct ext4_system_blocks { struct rb_root root; struct rcu_head rcu; }; /* * Flags for ext4_io_end->flags */ #define EXT4_IO_END_UNWRITTEN 0x0001 struct ext4_io_end_vec { struct list_head list; /* list of io_end_vec */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ }; /* * For converting unwritten extents on a work queue. 'handle' is used for * buffered writeback. */ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ handle_t *handle; /* handle reserved for extent * conversion */ struct inode *inode; /* file being written to */ struct bio *bio; /* Linked list of completed * bios covering the extent */ unsigned int flag; /* unwritten or not */ refcount_t count; /* reference counter */ struct list_head list_vec; /* list of ext4_io_end_vec */ } ext4_io_end_t; struct ext4_io_submit { struct writeback_control *io_wbc; struct bio *io_bio; ext4_io_end_t *io_end; sector_t io_next_block; }; /* * Special inodes numbers */ #define EXT4_BAD_INO 1 /* Bad blocks inode */ #define EXT4_ROOT_INO 2 /* Root inode */ #define EXT4_USR_QUOTA_INO 3 /* User quota inode */ #define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ #define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ #define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ #define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ #define EXT4_JOURNAL_INO 8 /* Journal inode */ /* First non-reserved inode for old ext4 filesystems */ #define EXT4_GOOD_OLD_FIRST_INO 11 /* * Maximal count of links to a file */ #define EXT4_LINK_MAX 65000 /* * Macro-instructions used to manage several block sizes */ #define EXT4_MIN_BLOCK_SIZE 1024 #define EXT4_MAX_BLOCK_SIZE 65536 #define EXT4_MIN_BLOCK_LOG_SIZE 10 #define EXT4_MAX_BLOCK_LOG_SIZE 16 #define EXT4_MAX_CLUSTER_LOG_SIZE 30 #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) #else # define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) #endif #define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) #define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ EXT4_SB(s)->s_cluster_bits) #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) # define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) #else # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) #endif #ifdef __KERNEL__ #define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) #define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) #define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) #else #define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ EXT4_GOOD_OLD_INODE_SIZE : \ (s)->s_inode_size) #define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ EXT4_GOOD_OLD_FIRST_INO : \ (s)->s_first_ino) #endif #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) #define EXT4_MAX_BLOCKS(size, offset, blkbits) \ ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ blkbits)) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) /* Translate a cluster number to a block number */ #define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) /* Translate # of blks to # of clusters */ #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ (sbi)->s_cluster_bits) /* Mask out the low bits to get the starting block of the cluster */ #define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* Fill in the low bits to get the last block of the cluster */ #define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) /* Get the cluster offset */ #define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* * Structure of a blocks group descriptor */ struct ext4_group_desc { __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ __le32 bg_inode_table_lo; /* Inodes table block */ __le16 bg_free_blocks_count_lo;/* Free blocks count */ __le16 bg_free_inodes_count_lo;/* Free inodes count */ __le16 bg_used_dirs_count_lo; /* Directories count */ __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ __le16 bg_itable_unused_lo; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ __le32 bg_inode_table_hi; /* Inodes table block MSB */ __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ __le16 bg_used_dirs_count_hi; /* Directories count MSB */ __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ __u32 bg_reserved; }; #define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ sizeof(__le16)) #define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ sizeof(__le16)) /* * Structure of a flex block group info */ struct flex_groups { atomic64_t free_clusters; atomic_t free_inodes; atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ /* * Macro-instructions used to manage group descriptors */ #define EXT4_MIN_DESC_SIZE 32 #define EXT4_MIN_DESC_SIZE_64BIT 64 #define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE #define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) #ifdef __KERNEL__ # define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) # define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) # define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) # define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) #else # define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) # define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) #endif /* * Constants relative to the data blocks */ #define EXT4_NDIR_BLOCKS 12 #define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS #define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) #define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) #define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) /* * Inode flags */ #define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ #define EXT4_UNRM_FL 0x00000002 /* Undelete */ #define EXT4_COMPR_FL 0x00000004 /* Compress file */ #define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ #define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ #define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ #define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ #define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ /* Reserved for compression usage... */ #define EXT4_DIRTY_FL 0x00000100 #define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ #define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ /* nb: was previously EXT2_ECOMPR_FL */ #define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ /* End compression flags --- maybe not all used */ #define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ #define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ #define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ #define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ #define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ #define EXT4_DAX_FL 0x02000000 /* Inode is DAX */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ /* User modifiable flags */ #define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \ EXT4_UNRM_FL | \ EXT4_COMPR_FL | \ EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ EXT4_APPEND_FL | \ EXT4_NODUMP_FL | \ EXT4_NOATIME_FL | \ EXT4_JOURNAL_DATA_FL | \ EXT4_NOTAIL_FL | \ EXT4_DIRSYNC_FL | \ EXT4_TOPDIR_FL | \ EXT4_EXTENTS_FL | \ 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \ EXT4_DAX_FL | \ EXT4_PROJINHERIT_FL | \ EXT4_CASEFOLD_FL) /* User visible flags */ #define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \ EXT4_DIRTY_FL | \ EXT4_COMPRBLK_FL | \ EXT4_NOCOMPR_FL | \ EXT4_ENCRYPT_FL | \ EXT4_INDEX_FL | \ EXT4_VERITY_FL | \ EXT4_INLINE_DATA_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ EXT4_DAX_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ EXT4_PROJINHERIT_FL)) /* Flags that are appropriate for non-directories/regular files. */ #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) /* The only flags that should be swapped */ #define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) /* Flags which are mutually exclusive to DAX */ #define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL) /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { if (S_ISDIR(mode)) return flags; else if (S_ISREG(mode)) return flags & EXT4_REG_FLMASK; else return flags & EXT4_OTHER_FLMASK; } /* * Inode flags used for atomic set/get */ enum { EXT4_INODE_SECRM = 0, /* Secure deletion */ EXT4_INODE_UNRM = 1, /* Undelete */ EXT4_INODE_COMPR = 2, /* Compress file */ EXT4_INODE_SYNC = 3, /* Synchronous updates */ EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ EXT4_INODE_APPEND = 5, /* writes to file may only append */ EXT4_INODE_NODUMP = 6, /* do not dump file */ EXT4_INODE_NOATIME = 7, /* do not update atime */ /* Reserved for compression usage... */ EXT4_INODE_DIRTY = 8, EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ EXT4_INODE_NOCOMPR = 10, /* Don't compress */ EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ /* End compression flags --- maybe not all used */ EXT4_INODE_INDEX = 12, /* hash-indexed directory */ EXT4_INODE_IMAGIC = 13, /* AFS directory */ EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ /* 22 was formerly EXT4_INODE_EOFBLOCKS */ EXT4_INODE_DAX = 25, /* Inode is DAX */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; /* * Since it's pretty easy to mix up bit numbers and hex values, we use a * build-time check to make sure that EXT4_XXX_FL is consistent with respect to * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost * any extra space in the compiled kernel image, otherwise, the build will fail. * It's important that these values are the same, since we are using * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk * values found in ext2, ext3 and ext4 filesystems, and of course the values * defined in e2fsprogs. * * It's not paranoia if the Murphy's Law really *is* out to get you. :-) */ #define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG)) #define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) static inline void ext4_check_flag_values(void) { CHECK_FLAG_VALUE(SECRM); CHECK_FLAG_VALUE(UNRM); CHECK_FLAG_VALUE(COMPR); CHECK_FLAG_VALUE(SYNC); CHECK_FLAG_VALUE(IMMUTABLE); CHECK_FLAG_VALUE(APPEND); CHECK_FLAG_VALUE(NODUMP); CHECK_FLAG_VALUE(NOATIME); CHECK_FLAG_VALUE(DIRTY); CHECK_FLAG_VALUE(COMPRBLK); CHECK_FLAG_VALUE(NOCOMPR); CHECK_FLAG_VALUE(ENCRYPT); CHECK_FLAG_VALUE(INDEX); CHECK_FLAG_VALUE(IMAGIC); CHECK_FLAG_VALUE(JOURNAL_DATA); CHECK_FLAG_VALUE(NOTAIL); CHECK_FLAG_VALUE(DIRSYNC); CHECK_FLAG_VALUE(TOPDIR); CHECK_FLAG_VALUE(HUGE_FILE); CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(VERITY); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(PROJINHERIT); CHECK_FLAG_VALUE(CASEFOLD); CHECK_FLAG_VALUE(RESERVED); } #if defined(__KERNEL__) && defined(CONFIG_COMPAT) struct compat_ext4_new_group_input { u32 group; compat_u64 block_bitmap; compat_u64 inode_bitmap; compat_u64 inode_table; u32 blocks_count; u16 reserved_blocks; u16 unused; }; #endif /* The struct ext4_new_group_input in kernel space, with free_blocks_count */ struct ext4_new_group_data { __u32 group; __u64 block_bitmap; __u64 inode_bitmap; __u64 inode_table; __u32 blocks_count; __u16 reserved_blocks; __u16 mdata_blocks; __u32 free_clusters_count; }; /* Indexes used to index group tables in ext4_new_group_data */ enum { BLOCK_BITMAP = 0, /* block bitmap */ INODE_BITMAP, /* inode bitmap */ INODE_TABLE, /* inode tables */ GROUP_TABLE_COUNT, }; /* * Flags used by ext4_map_blocks() */ /* Allocate any needed blocks and/or convert an unwritten extent to be an initialized ext4 */ #define EXT4_GET_BLOCKS_CREATE 0x0001 /* Request the creation of an unwritten extent */ #define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ EXT4_GET_BLOCKS_CREATE) /* Caller is from the delayed allocation writeout path * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* caller is from the direct IO path, request to creation of an unwritten extents if not allocated, split the unwritten extent if blocks has been preallocated already*/ #define EXT4_GET_BLOCKS_PRE_IO 0x0008 #define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Convert written extents to unwritten */ #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 /* Write zeros to newly created written extents */ #define EXT4_GET_BLOCKS_ZERO 0x0200 #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ EXT4_GET_BLOCKS_ZERO) /* Caller will submit data before dropping transaction handle. This * allows jbd2 to avoid submitting data before commit. */ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 /* Caller is in the atomic contex, find extent if it has been cached */ #define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 /* * The bit position of these flags must not overlap with any of the * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), * read_extent_tree_block(), ext4_split_extent_at(), * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be * caching the extents when reading from the extent tree while a * truncate or punch hole operation is in progress. */ #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 #define EXT4_EX_NOFAIL 0x10000000 /* * Flags used by ext4_free_blocks */ #define EXT4_FREE_BLOCKS_METADATA 0x0001 #define EXT4_FREE_BLOCKS_FORGET 0x0002 #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 #define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation */ #define EXT4_IOC32_GETVERSION _IOR('f', 3, int) #define EXT4_IOC32_SETVERSION _IOW('f', 4, int) #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) #define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF /* Max logical block we can support */ #define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE /* * Structure of an inode on the disk */ struct ext4_inode { __le16 i_mode; /* File mode */ __le16 i_uid; /* Low 16 bits of Owner Uid */ __le32 i_size_lo; /* Size in bytes */ __le32 i_atime; /* Access time */ __le32 i_ctime; /* Inode Change time */ __le32 i_mtime; /* Modification time */ __le32 i_dtime; /* Deletion Time */ __le16 i_gid; /* Low 16 bits of Group Id */ __le16 i_links_count; /* Links count */ __le32 i_blocks_lo; /* Blocks count */ __le32 i_flags; /* File flags */ union { struct { __le32 l_i_version; } linux1; struct { __u32 h_i_translator; } hurd1; struct { __u32 m_i_reserved1; } masix1; } osd1; /* OS dependent 1 */ __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ __le32 i_generation; /* File version (for NFS) */ __le32 i_file_acl_lo; /* File ACL */ __le32 i_size_high; __le32 i_obso_faddr; /* Obsoleted fragment address */ union { struct { __le16 l_i_blocks_high; /* were l_i_reserved1 */ __le16 l_i_file_acl_high; __le16 l_i_uid_high; /* these 2 fields */ __le16 l_i_gid_high; /* were reserved2[0] */ __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ __le16 l_i_reserved; } linux2; struct { __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ __u16 h_i_mode_high; __u16 h_i_uid_high; __u16 h_i_gid_high; __u32 h_i_author; } hurd2; struct { __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ __le16 m_i_file_acl_high; __u32 m_i_reserved2[2]; } masix2; } osd2; /* OS dependent 2 */ __le16 i_extra_isize; __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ __le32 i_crtime; /* File Creation time */ __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ __le32 i_version_hi; /* high 32 bits for 64-bit version */ __le32 i_projid; /* Project ID */ }; #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) #define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) /* * Extended fields will fit into an inode if the filesystem was formatted * with large inodes (-I 256 or larger) and there are not currently any EAs * consuming all of the available space. For new inodes we always reserve * enough space for the kernel's known extended fields, but for inodes * created with an old kernel this might not have been the case. None of * the extended inode fields is critical for correct filesystem operation. * This macro checks if a certain field fits in the inode. Note that * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize */ #define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ ((offsetof(typeof(*ext4_inode), field) + \ sizeof((ext4_inode)->field)) \ <= (EXT4_GOOD_OLD_INODE_SIZE + \ (einode)->i_extra_isize)) \ /* * We use an encoding that preserves the times for extra epoch "00": * * extra msb of adjust for signed * epoch 32-bit 32-bit tv_sec to * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 * * Note that previous versions of the kernel on 64-bit systems would * incorrectly use extra epoch bits 1,1 for dates between 1901 and * 1970. e2fsck will correct this, assuming that it is run on the * affected filesystem before 2242. */ static inline __le32 ext4_encode_extra_time(struct timespec64 ts) { u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK; return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS)); } static inline struct timespec64 ext4_decode_extra_time(__le32 base, __le32 extra) { struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) }; if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; return ts; } #define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \ do { \ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \ (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \ } else \ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ } while (0) #define EXT4_INODE_SET_ATIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode)) #define EXT4_INODE_SET_MTIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode)) #define EXT4_INODE_SET_CTIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \ raw_inode, (einode)->xtime) #define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \ (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \ ext4_decode_extra_time((raw_inode)->xtime, \ (raw_inode)->xtime ## _extra) : \ (struct timespec64) { \ .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ }) #define EXT4_INODE_GET_ATIME(inode, raw_inode) \ do { \ inode_set_atime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \ } while (0) #define EXT4_INODE_GET_MTIME(inode, raw_inode) \ do { \ inode_set_mtime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \ } while (0) #define EXT4_INODE_GET_CTIME(inode, raw_inode) \ do { \ inode_set_ctime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \ } while (0) #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ (einode)->xtime = \ EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \ raw_inode); \ else \ (einode)->xtime = (struct timespec64){0, 0}; \ } while (0) #define i_disk_version osd1.linux1.l_i_version #if defined(__KERNEL__) || defined(__linux__) #define i_reserved1 osd1.linux1.l_i_reserved1 #define i_file_acl_high osd2.linux2.l_i_file_acl_high #define i_blocks_high osd2.linux2.l_i_blocks_high #define i_uid_low i_uid #define i_gid_low i_gid #define i_uid_high osd2.linux2.l_i_uid_high #define i_gid_high osd2.linux2.l_i_gid_high #define i_checksum_lo osd2.linux2.l_i_checksum_lo #elif defined(__GNU__) #define i_translator osd1.hurd1.h_i_translator #define i_uid_high osd2.hurd2.h_i_uid_high #define i_gid_high osd2.hurd2.h_i_gid_high #define i_author osd2.hurd2.h_i_author #elif defined(__masix__) #define i_reserved1 osd1.masix1.m_i_reserved1 #define i_file_acl_high osd2.masix2.m_i_file_acl_high #define i_reserved2 osd2.masix2.m_i_reserved2 #endif /* defined(__KERNEL__) || defined(__linux__) */ #include "extents_status.h" #include "fast_commit.h" /* * Lock subclasses for i_data_sem in the ext4_inode_info structure. * * These are needed to avoid lockdep false positives when we need to * allocate blocks to the quota inode during ext4_map_blocks(), while * holding i_data_sem for a normal (non-quota) inode. Since we don't * do quota tracking for the quota inode, this avoids deadlock (as * well as infinite recursion, since it isn't turtles all the way * down...) * * I_DATA_SEM_NORMAL - Used for most inodes * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode * where the second inode has larger inode number * than the first * I_DATA_SEM_QUOTA - Used for quota inodes only * I_DATA_SEM_EA - Used for ea_inodes only */ enum { I_DATA_SEM_NORMAL = 0, I_DATA_SEM_OTHER, I_DATA_SEM_QUOTA, I_DATA_SEM_EA }; /* * fourth extended file system inode data in memory */ struct ext4_inode_info { __le32 i_data[15]; /* unconverted */ __u32 i_dtime; ext4_fsblk_t i_file_acl; /* * i_block_group is the number of the block group which contains * this file's inode. Constant across the lifetime of the inode, * it is used for making block allocation decisions - we try to * place a file's data blocks near its inode block, and new inodes * near to their parent directory's inode. */ ext4_group_t i_block_group; ext4_lblk_t i_dir_start_lookup; #if (BITS_PER_LONG < 64) unsigned long i_state_flags; /* Dynamic state flags */ #endif unsigned long i_flags; /* * Extended attributes can be read independently of the main file * data. Taking i_rwsem even when reading would cause contention * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. */ struct rw_semaphore xattr_sem; /* * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise * i_orphan is used. */ union { struct list_head i_orphan; /* unlinked but open inodes */ unsigned int i_orphan_idx; /* Index in orphan file */ }; /* Fast commit related info */ /* For tracking dentry create updates */ struct list_head i_fc_dilist; struct list_head i_fc_list; /* * inodes that need fast commit * protected by sbi->s_fc_lock. */ /* Start of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_start; /* End of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_len; /* Number of ongoing updates on this inode */ atomic_t i_fc_updates; /* Fast commit wait queue for this inode */ wait_queue_head_t i_fc_wait; /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */ struct mutex i_fc_lock; /* * i_disksize keeps track of what the inode size is ON DISK, not * in memory. During truncate, i_size is set to the new size by * the VFS prior to calling ext4_truncate(), but the filesystem won't * set i_disksize to 0 until the truncate is actually under way. * * The intent is that i_disksize always represents the blocks which * are used by this file. This allows recovery to restart truncate * on orphans if we crash during truncate. We actually write i_disksize * into the on-disk inode when writing inodes out, instead of i_size. * * The only time when i_disksize and i_size may be different is when * a truncate is in progress. The only things which change i_disksize * are ext4_get_block (growth) and ext4_truncate (shrinkth). */ loff_t i_disksize; /* * i_data_sem is for serialising ext4_truncate() against * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's * data tree are chopped off during truncate. We can't do that in * ext4 because whenever we perform intermediate commits during * truncate, the inode and all the metadata blocks *must* be in a * consistent state which allows truncation of the orphans to restart * during recovery. Hence we must fix the get_block-vs-truncate race * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; struct inode vfs_inode; struct jbd2_inode *jinode; spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* * File creation time. Its function is same as that of * struct timespec64 i_{a,c,m}time in the generic inode. */ struct timespec64 i_crtime; /* mballoc */ atomic_t i_prealloc_active; struct rb_root i_prealloc_node; rwlock_t i_prealloc_lock; /* extents status tree */ struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; struct list_head i_es_list; unsigned int i_es_all_nr; /* protected by i_es_lock */ unsigned int i_es_shk_nr; /* protected by i_es_lock */ ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for extents to shrink. Protected by i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; /* allocation reservation info for delalloc */ /* In case of bigalloc, this refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; /* pending cluster reservations for bigalloc file systems */ struct ext4_pending_tree i_pending_tree; /* on-disk additional length */ __u16 i_extra_isize; /* Indicate the inline data space. */ u16 i_inline_off; u16 i_inline_size; #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; #endif /* Lock protecting lists below */ spinlock_t i_completed_io_lock; /* * Completed IOs that need unwritten extents handling and have * transaction reserved */ struct list_head i_rsv_conversion_list; struct work_struct i_rsv_conversion_work; atomic_t i_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; /* * Transactions that contain inode's metadata needed to complete * fsync and fdatasync, respectively. */ tid_t i_sync_tid; tid_t i_datasync_tid; #ifdef CONFIG_QUOTA struct dquot *i_dquot[MAXQUOTAS]; #endif /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ __u32 i_csum_seed; kprojid_t i_projid; }; /* * File system states */ #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ #define EXT4_ERROR_FS 0x0002 /* Errors detected */ #define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ #define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */ /* * Misc. filesystem flags */ #define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ #define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ /* * Mount flags set via mount options or defaults */ #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ #define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ #define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #ifdef CONFIG_FS_DAX #define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */ #else #define EXT4_MOUNT_DAX_ALWAYS 0 #endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ #define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ #define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ #define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ #define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ #define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, * enable enforcement for hidden * quota files */ #define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable * enforcement for hidden quota * files */ #define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota * enforcement */ #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ #define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ /* * Mount flags set either automatically (could not be set by mount option) * based on per file system feature or property or in special cases such as * distinguishing between explicit mount option definition and default. */ #define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly specified delalloc */ #define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group size of blocksize * 8 blocks */ #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated file systems */ #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly specified journal checksum */ #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ #define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group * scanning in mballoc */ #define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ EXT4_MOUNT_##opt #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ EXT4_MOUNT_##opt) #define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ ~EXT4_MOUNT2_##opt #define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ EXT4_MOUNT2_##opt #define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ EXT4_MOUNT2_##opt) #define ext4_test_and_set_bit __test_and_set_bit_le #define ext4_set_bit __set_bit_le #define ext4_test_and_clear_bit __test_and_clear_bit_le #define ext4_clear_bit __clear_bit_le #define ext4_test_bit test_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le extern void mb_set_bits(void *bm, int cur, int len); /* * Maximal mount counts between two filesystem checks */ #define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ #define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ /* * Behaviour when detecting errors */ #define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ #define EXT4_ERRORS_RO 2 /* Remount fs read-only */ #define EXT4_ERRORS_PANIC 3 /* Panic */ #define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 #define EXT4_LABEL_MAX 16 /* * Structure of the super block */ struct ext4_super_block { /*00*/ __le32 s_inodes_count; /* Inodes count */ __le32 s_blocks_count_lo; /* Blocks count */ __le32 s_r_blocks_count_lo; /* Reserved blocks count */ __le32 s_free_blocks_count_lo; /* Free blocks count */ /*10*/ __le32 s_free_inodes_count; /* Free inodes count */ __le32 s_first_data_block; /* First Data Block */ __le32 s_log_block_size; /* Block size */ __le32 s_log_cluster_size; /* Allocation cluster size */ /*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ __le32 s_clusters_per_group; /* # Clusters per group */ __le32 s_inodes_per_group; /* # Inodes per group */ __le32 s_mtime; /* Mount time */ /*30*/ __le32 s_wtime; /* Write time */ __le16 s_mnt_count; /* Mount count */ __le16 s_max_mnt_count; /* Maximal mount count */ __le16 s_magic; /* Magic signature */ __le16 s_state; /* File system state */ __le16 s_errors; /* Behaviour when detecting errors */ __le16 s_minor_rev_level; /* minor revision level */ /*40*/ __le32 s_lastcheck; /* time of last check */ __le32 s_checkinterval; /* max. time between checks */ __le32 s_creator_os; /* OS */ __le32 s_rev_level; /* Revision level */ /*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ __le16 s_def_resgid; /* Default gid for reserved blocks */ /* * These fields are for EXT4_DYNAMIC_REV superblocks only. * * Note: the difference between the compatible feature set and * the incompatible feature set is that if there is a bit set * in the incompatible feature set that the kernel doesn't * know about, it should refuse to mount the filesystem. * * e2fsck's requirements are more strict; if it doesn't know * about a feature in either the compatible or incompatible * feature set, it must abort and not try to meddle with * things it doesn't understand... */ __le32 s_first_ino; /* First non-reserved inode */ __le16 s_inode_size; /* size of inode structure */ __le16 s_block_group_nr; /* block group # of this superblock */ __le32 s_feature_compat; /* compatible feature set */ /*60*/ __le32 s_feature_incompat; /* incompatible feature set */ __le32 s_feature_ro_compat; /* readonly-compatible feature set */ /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ /*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */ /*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ /*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ /* * Performance hints. Directory preallocation should only * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. */ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ /* * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. */ /*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ /*E0*/ __le32 s_journal_inum; /* inode number of journal file */ __le32 s_journal_dev; /* device number of journal file */ __le32 s_last_orphan; /* start of list of inodes to delete */ __le32 s_hash_seed[4]; /* HTREE hash seed */ __u8 s_def_hash_version; /* Default hash version to use */ __u8 s_jnl_backup_type; __le16 s_desc_size; /* size of group descriptor */ /*100*/ __le32 s_default_mount_opts; __le32 s_first_meta_bg; /* First metablock block group */ __le32 s_mkfs_time; /* When the filesystem was created */ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */ /*150*/ __le32 s_blocks_count_hi; /* Blocks count */ __le32 s_r_blocks_count_hi; /* Reserved blocks count */ __le32 s_free_blocks_count_hi; /* Free blocks count */ __le16 s_min_extra_isize; /* All inodes have at least # bytes */ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ __le32 s_flags; /* Miscellaneous flags */ __le16 s_raid_stride; /* RAID stride */ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ __u8 s_checksum_type; /* metadata checksum algorithm used */ __u8 s_encryption_level; /* versioning level for encryption */ __u8 s_reserved_pad; /* Padding to next 32bits */ __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ __le32 s_snapshot_inum; /* Inode number of active snapshot */ __le32 s_snapshot_id; /* sequential ID of active snapshot */ __le64 s_snapshot_r_blocks_count; /* reserved blocks for active snapshot's future use */ __le32 s_snapshot_list; /* inode number of the head of the on-disk snapshot list */ #define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) __le32 s_error_count; /* number of fs errors */ __le32 s_first_error_time; /* first time an error happened */ __le32 s_first_error_ino; /* inode involved in first error */ __le64 s_first_error_block; /* block involved of first error */ __u8 s_first_error_func[32] __nonstring; /* function where the error happened */ __le32 s_first_error_line; /* line number where error happened */ __le32 s_last_error_time; /* most recent time of an error */ __le32 s_last_error_ino; /* inode involved in last error */ __le32 s_last_error_line; /* line number where error happened */ __le64 s_last_error_block; /* block involved of last error */ __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) __u8 s_mount_opts[64]; __le32 s_usr_quota_inum; /* inode for tracking user quota */ __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ __le32 s_lpf_ino; /* Location of the lost+found inode */ __le32 s_prj_quota_inum; /* inode for tracking project quota */ __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ __u8 s_wtime_hi; __u8 s_mtime_hi; __u8 s_mkfs_time_hi; __u8 s_lastcheck_hi; __u8 s_first_error_time_hi; __u8 s_last_error_time_hi; __u8 s_first_error_errcode; __u8 s_last_error_errcode; __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ __le32 s_reserved[94]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) #ifdef __KERNEL__ /* Number of quota types we support */ #define EXT4_MAXQUOTAS 3 #define EXT4_ENC_UTF8_12_1 1 /* Types of ext4 journal triggers */ enum ext4_journal_trigger_type { EXT4_JTR_ORPHAN_FILE, EXT4_JTR_NONE /* This must be the last entry for indexing to work! */ }; #define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE struct ext4_journal_trigger { struct jbd2_buffer_trigger_type tr_triggers; struct super_block *sb; }; static inline struct ext4_journal_trigger *EXT4_TRIGGER( struct jbd2_buffer_trigger_type *trigger) { return container_of(trigger, struct ext4_journal_trigger, tr_triggers); } #define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04 /* Structure at the tail of orphan block */ struct ext4_orphan_block_tail { __le32 ob_magic; __le32 ob_checksum; }; static inline int ext4_inodes_per_orphan_block(struct super_block *sb) { return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) / sizeof(u32); } struct ext4_orphan_block { atomic_t ob_free_entries; /* Number of free orphan entries in block */ struct buffer_head *ob_bh; /* Buffer for orphan block */ }; /* * Info about orphan file. */ struct ext4_orphan_info { int of_blocks; /* Number of orphan blocks in a file */ __u32 of_csum_seed; /* Checksum seed for orphan file */ struct ext4_orphan_block *of_binfo; /* Array with info about orphan * file blocks */ }; /* * fourth extended-fs super-block data in memory */ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ unsigned long s_inodes_per_block;/* Number of inodes per block */ unsigned long s_blocks_per_group;/* Number of blocks in a group */ unsigned long s_clusters_per_group; /* Number of clusters in a group */ unsigned long s_inodes_per_group;/* Number of inodes in a group */ unsigned long s_itb_per_group; /* Number of inode table blocks per group */ unsigned long s_gdb_count; /* Number of group descriptor blocks */ unsigned long s_desc_per_block; /* Number of group descriptors per block */ ext4_group_t s_groups_count; /* Number of groups in the fs */ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ unsigned long s_overhead; /* # of fs overhead clusters */ unsigned int s_cluster_ratio; /* Number of blocks per cluster */ unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ /* Array of bh's for the block group descriptors */ struct buffer_head * __rcu *s_group_desc; unsigned int s_mount_opt; unsigned int s_mount_opt2; unsigned long s_mount_flags; unsigned int s_def_mount_opt; unsigned int s_def_mount_opt2; ext4_fsblk_t s_sb_block; atomic64_t s_resv_clusters; kuid_t s_resuid; kgid_t s_resgid; unsigned short s_mount_state; unsigned short s_pad; int s_addr_per_block_bits; int s_desc_per_block_bits; int s_inode_size; int s_first_ino; unsigned int s_inode_readahead_blks; unsigned int s_inode_goal; u32 s_hash_seed[4]; int s_def_hash_version; int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ struct percpu_counter s_freeclusters_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; struct percpu_counter s_dirtyclusters_counter; struct percpu_counter s_sra_exceeded_retry_limit; struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; struct kobject s_kobj; struct completion s_kobj_unregister; struct super_block *s_sb; struct buffer_head *s_mmp_bh; /* Journaling */ struct journal_s *s_journal; unsigned long s_ext4_flags; /* Ext4 superblock flags */ struct mutex s_orphan_lock; /* Protects on disk list changes */ struct list_head s_orphan; /* List of orphaned inodes in on disk list */ struct ext4_orphan_info s_orphan_info; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; struct bdev_handle *s_journal_bdev_handle; #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char __rcu *s_qf_names[EXT4_MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ struct ext4_system_blocks __rcu *s_system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ unsigned long s_ext_min; unsigned long s_ext_max; unsigned long s_depth_max; spinlock_t s_ext_stats_lock; unsigned long s_ext_blocks; unsigned long s_ext_extents; #endif /* for buddy allocator */ struct ext4_group_info ** __rcu *s_group_info; struct inode *s_buddy_cache; spinlock_t s_md_lock; unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; unsigned int s_group_info_size; unsigned int s_mb_free_pending; struct list_head s_freed_data_list[2]; /* List of blocks to be freed after commit completed */ struct list_head s_discard_list; struct work_struct s_discard_work; atomic_t s_retry_alloc_pending; struct list_head *s_mb_avg_fragment_size; rwlock_t *s_mb_avg_fragment_size_locks; struct list_head *s_mb_largest_free_orders; rwlock_t *s_mb_largest_free_orders_locks; /* tunables */ unsigned long s_stripe; unsigned int s_mb_max_linear_groups; unsigned int s_mb_stream_request; unsigned int s_mb_max_to_scan; unsigned int s_mb_min_to_scan; unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; unsigned int s_max_dir_size_kb; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; unsigned int s_mb_best_avail_max_trim_order; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ atomic_t s_bal_success; /* we found long enough chunks */ atomic_t s_bal_allocated; /* in blocks */ atomic_t s_bal_ex_scanned; /* total extents scanned */ atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ atomic_t s_bal_len_goals; /* len goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ atomic_t s_bal_p2_aligned_bad_suggestions; atomic_t s_bal_goal_fast_bad_suggestions; atomic_t s_bal_best_avail_bad_suggestions; atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ atomic_t s_mb_buddies_generated; /* number of buddies generated */ atomic64_t s_mb_generation_time; atomic_t s_mb_lost_chunks; atomic_t s_mb_preallocated; atomic_t s_mb_discarded; atomic_t s_lock_busy; /* locality groups */ struct ext4_locality_group __percpu *s_locality_groups; /* for write statistics */ unsigned long s_sectors_written_start; u64 s_kbytes_written; /* the size of zero-out chunk */ unsigned int s_extent_max_zeroout_kb; unsigned int s_log_groups_per_flex; struct flex_groups * __rcu *s_flex_groups; ext4_group_t s_flex_groups_allocated; /* workqueue for reserved extent conversions (buffered io) */ struct workqueue_struct *rsv_conversion_wq; /* timer for periodic error stats printing */ struct timer_list s_err_report; /* Lazy inode table initialization info */ struct ext4_li_request *s_li_request; /* Wait multiplier for lazy initialization thread */ unsigned int s_li_wait_mult; /* Kernel thread for multiple mount protection */ struct task_struct *s_mmp_tsk; /* record the last minlen when FITRIM is called. */ unsigned long s_last_trim_minblks; /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; /* Reclaim extents from extent status tree */ struct shrinker *s_es_shrinker; struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; struct mb_cache *s_ea_block_cache; struct mb_cache *s_ea_inode_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Journal triggers for checksum computation */ struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT]; /* Ratelimit ext4 messages. */ struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; atomic_t s_warning_count; atomic_t s_msg_count; /* Encryption policy for '-o test_dummy_encryption' */ struct fscrypt_dummy_policy s_dummy_enc_policy; /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA * or EXTENTS flag or between writepages ops and changing DELALLOC or * DIOREAD_NOLOCK mount options on remount. */ struct percpu_rw_semaphore s_writepages_rwsem; struct dax_device *s_daxdev; u64 s_dax_part_off; #ifdef CONFIG_EXT4_DEBUG unsigned long s_simulate_fail; #endif /* Record the errseq of the backing block device */ errseq_t s_bdev_wb_err; spinlock_t s_bdev_wb_lock; /* Information about errors that happened during this mount */ spinlock_t s_error_lock; int s_add_error_count; int s_first_error_code; __u32 s_first_error_line; __u32 s_first_error_ino; __u64 s_first_error_block; const char *s_first_error_func; time64_t s_first_error_time; int s_last_error_code; __u32 s_last_error_line; __u32 s_last_error_ino; __u64 s_last_error_block; const char *s_last_error_func; time64_t s_last_error_time; /* * If we are in a context where we cannot update the on-disk * superblock, we queue the work here. This is used to update * the error information in the superblock, and for periodic * updates of the superblock called from the commit callback * function. */ struct work_struct s_sb_upd_work; /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; /* * After commit starts, the main queue gets locked, and the further * updates get added in the staging queue. */ #define FC_Q_MAIN 0 #define FC_Q_STAGING 1 struct list_head s_fc_q[2]; /* Inodes staged for fast commit * that have data changes in them. */ struct list_head s_fc_dentry_q[2]; /* directory entry updates */ unsigned int s_fc_bytes; /* * Main fast commit lock. This lock protects accesses to the * following fields: * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. */ spinlock_t s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; tid_t s_fc_ineligible_tid; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif struct ext4_fc_replay_state s_fc_replay_state; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) { return sb->s_fs_info; } static inline struct ext4_inode_info *EXT4_I(struct inode *inode) { return container_of(inode, struct ext4_inode_info, vfs_inode); } static inline int ext4_writepages_down_read(struct super_block *sb) { percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); return memalloc_nofs_save(); } static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) { memalloc_nofs_restore(ctx); percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); } static inline int ext4_writepages_down_write(struct super_block *sb) { percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); return memalloc_nofs_save(); } static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) { memalloc_nofs_restore(ctx); percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); } static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } /* * Returns: sbi->field[index] * Used to access an array element from the following sbi fields which require * rcu protection to avoid dereferencing an invalid pointer due to reassignment * - s_group_desc * - s_group_info * - s_flex_group */ #define sbi_array_rcu_deref(sbi, field, index) \ ({ \ typeof(*((sbi)->field)) _v; \ rcu_read_lock(); \ _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ rcu_read_unlock(); \ _v; \ }) /* * run-time mount flags */ enum { EXT4_MF_MNTDIR_SAMPLED, EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) { set_bit(bit, &EXT4_SB(sb)->s_mount_flags); } static inline void ext4_clear_mount_flag(struct super_block *sb, int bit) { clear_bit(bit, &EXT4_SB(sb)->s_mount_flags); } static inline int ext4_test_mount_flag(struct super_block *sb, int bit) { return test_bit(bit, &EXT4_SB(sb)->s_mount_flags); } /* * Simulate_fail codes */ #define EXT4_SIM_BBITMAP_EIO 1 #define EXT4_SIM_BBITMAP_CRC 2 #define EXT4_SIM_IBITMAP_EIO 3 #define EXT4_SIM_IBITMAP_CRC 4 #define EXT4_SIM_INODE_EIO 5 #define EXT4_SIM_INODE_CRC 6 #define EXT4_SIM_DIRBLOCK_EIO 7 #define EXT4_SIM_DIRBLOCK_CRC 8 static inline bool ext4_simulate_fail(struct super_block *sb, unsigned long code) { #ifdef CONFIG_EXT4_DEBUG struct ext4_sb_info *sbi = EXT4_SB(sb); if (unlikely(sbi->s_simulate_fail == code)) { sbi->s_simulate_fail = 0; return true; } #endif return false; } static inline void ext4_simulate_fail_bh(struct super_block *sb, struct buffer_head *bh, unsigned long code) { if (!IS_ERR(bh) && ext4_simulate_fail(sb, code)) clear_buffer_uptodate(bh); } /* * Error number codes for s_{first,last}_error_errno * * Linux errno numbers are architecture specific, so we need to translate * them into something which is architecture independent. We don't define * codes for all errno's; just the ones which are most likely to be the cause * of an ext4_error() call. */ #define EXT4_ERR_UNKNOWN 1 #define EXT4_ERR_EIO 2 #define EXT4_ERR_ENOMEM 3 #define EXT4_ERR_EFSBADCRC 4 #define EXT4_ERR_EFSCORRUPTED 5 #define EXT4_ERR_ENOSPC 6 #define EXT4_ERR_ENOKEY 7 #define EXT4_ERR_EROFS 8 #define EXT4_ERR_EFBIG 9 #define EXT4_ERR_EEXIST 10 #define EXT4_ERR_ERANGE 11 #define EXT4_ERR_EOVERFLOW 12 #define EXT4_ERR_EBUSY 13 #define EXT4_ERR_ENOTDIR 14 #define EXT4_ERR_ENOTEMPTY 15 #define EXT4_ERR_ESHUTDOWN 16 #define EXT4_ERR_EFAULT 17 /* * Inode dynamic state flags */ enum { EXT4_STATE_NEW, /* inode is newly created */ EXT4_STATE_XATTR, /* has in-inode xattrs */ EXT4_STATE_NO_EXPAND, /* No space for expansion */ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ { \ return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ { \ set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ { \ clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } /* Add these declarations here only so that these functions can be * found by name. Otherwise, they are very hard to locate. */ static inline int ext4_test_inode_flag(struct inode *inode, int bit); static inline void ext4_set_inode_flag(struct inode *inode, int bit); static inline void ext4_clear_inode_flag(struct inode *inode, int bit); EXT4_INODE_BIT_FNS(flag, flags, 0) /* Add these declarations here only so that these functions can be * found by name. Otherwise, they are very hard to locate. */ static inline int ext4_test_inode_state(struct inode *inode, int bit); static inline void ext4_set_inode_state(struct inode *inode, int bit); static inline void ext4_clear_inode_state(struct inode *inode, int bit); #if (BITS_PER_LONG < 64) EXT4_INODE_BIT_FNS(state, state_flags, 0) static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) { (ei)->i_state_flags = 0; } #else EXT4_INODE_BIT_FNS(state, flags, 32) static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) { /* We depend on the fact that callers will set i_flags */ } #endif #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test * macros from user land. */ #define EXT4_SB(sb) (sb) #endif static inline bool ext4_verity_in_progress(struct inode *inode) { return IS_ENABLED(CONFIG_FS_VERITY) && ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); } #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* * Codes for operating systems */ #define EXT4_OS_LINUX 0 #define EXT4_OS_HURD 1 #define EXT4_OS_MASIX 2 #define EXT4_OS_FREEBSD 3 #define EXT4_OS_LITES 4 /* * Revision levels */ #define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ #define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ #define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV #define EXT4_GOOD_OLD_INODE_SIZE 128 #define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) #define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX #define EXT4_TIMESTAMP_MIN S32_MIN /* * Feature set definitions */ #define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 #define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 #define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 #define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 /* * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes * incompatible only if fast commit blocks are present in the FS. Since we * clear the journal (and thus the fast commit blocks), we don't mark FS as * incompatible. We also have a JBD2 incompat feature, which gets set when * there are fast commit blocks present in the journal. */ #define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 #define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 #define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */ #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 #define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 #define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 /* * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When * METADATA_CSUM is set, group descriptor checksums use the same algorithm as * all other data structures' checksums. However, the METADATA_CSUM and * GDT_CSUM bits are mutually exclusive. */ #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 #define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 #define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be non-empty */ #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 #define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ #define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ #define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 #define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 #define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000 extern void ext4_update_dynamic_rev(struct super_block *sb); #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_compat & \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_compat |= \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_compat &= \ ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } #define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } #define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_incompat |= \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_incompat &= \ ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT) EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT) EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_META_BG) #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) #define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG) #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) #define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \ EXT4_FEATURE_COMPAT_ORPHAN_FILE) #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ EXT4_FEATURE_INCOMPAT_EA_INODE| \ EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ EXT4_FEATURE_INCOMPAT_CASEFOLD | \ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ EXT4_FEATURE_INCOMPAT_LARGEDIR) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ EXT4_FEATURE_RO_COMPAT_QUOTA |\ EXT4_FEATURE_RO_COMPAT_PROJECT |\ EXT4_FEATURE_RO_COMPAT_VERITY |\ EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_compat & \ cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ } \ static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ } \ static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ } EXTN_FEATURE_FUNCS(2) EXTN_FEATURE_FUNCS(3) EXTN_FEATURE_FUNCS(4) static inline bool ext4_has_compat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_compat != 0); } static inline bool ext4_has_ro_compat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); } static inline bool ext4_has_incompat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); } extern int ext4_feature_set_ok(struct super_block *sb, int readonly); /* * Superblock flags */ #define EXT4_FLAGS_RESIZING 0 #define EXT4_FLAGS_SHUTDOWN 1 #define EXT4_FLAGS_BDEV_IS_DAX 2 static inline int ext4_forced_shutdown(struct super_block *sb) { return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } /* * Default values for user and/or group using reserved blocks */ #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 /* * Default project ID */ #define EXT4_DEF_PROJID 0 #define EXT4_DEF_INODE_READAHEAD_BLKS 32 /* * Default mount options */ #define EXT4_DEFM_DEBUG 0x0001 #define EXT4_DEFM_BSDGROUPS 0x0002 #define EXT4_DEFM_XATTR_USER 0x0004 #define EXT4_DEFM_ACL 0x0008 #define EXT4_DEFM_UID16 0x0010 #define EXT4_DEFM_JMODE 0x0060 #define EXT4_DEFM_JMODE_DATA 0x0020 #define EXT4_DEFM_JMODE_ORDERED 0x0040 #define EXT4_DEFM_JMODE_WBACK 0x0060 #define EXT4_DEFM_NOBARRIER 0x0100 #define EXT4_DEFM_BLOCK_VALIDITY 0x0200 #define EXT4_DEFM_DISCARD 0x0400 #define EXT4_DEFM_NODELALLOC 0x0800 /* * Default journal batch times */ #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ /* * Minimum number of groups in a flexgroup before we separate out * directories into the first block group of a flexgroup */ #define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 /* * Structure of a directory entry */ #define EXT4_NAME_LEN 255 /* * Base length of the ext4 directory entry excluding the name length */ #define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) struct ext4_dir_entry { __le32 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __le16 name_len; /* Name length */ char name[EXT4_NAME_LEN]; /* File name */ }; /* * Encrypted Casefolded entries require saving the hash on disk. This structure * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned * boundary. */ struct ext4_dir_entry_hash { __le32 hash; __le32 minor_hash; }; /* * The new version of the directory entry. Since EXT4 structures are * stored in intel byte order, and the name_len field could never be * bigger than 255 chars, it's safe to reclaim the extra byte for the * file_type field. */ struct ext4_dir_entry_2 { __le32 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __u8 name_len; /* Name length */ __u8 file_type; /* See file type macros EXT4_FT_* below */ char name[EXT4_NAME_LEN]; /* File name */ }; /* * Access the hashes at the end of ext4_dir_entry_2 */ #define EXT4_DIRENT_HASHES(entry) \ ((struct ext4_dir_entry_hash *) \ (((void *)(entry)) + \ ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) #define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash) #define EXT4_DIRENT_MINOR_HASH(entry) \ le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash) static inline bool ext4_hash_in_dirent(const struct inode *inode) { return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode); } /* * This is a bogus directory entry at the end of each leaf block that * records checksums. */ struct ext4_dir_entry_tail { __le32 det_reserved_zero1; /* Pretend to be unused */ __le16 det_rec_len; /* 12 */ __u8 det_reserved_zero2; /* Zero name length */ __u8 det_reserved_ft; /* 0xDE, fake file type */ __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ }; #define EXT4_DIRENT_TAIL(block, blocksize) \ ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ ((blocksize) - \ sizeof(struct ext4_dir_entry_tail)))) /* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. */ #define EXT4_FT_UNKNOWN 0 #define EXT4_FT_REG_FILE 1 #define EXT4_FT_DIR 2 #define EXT4_FT_CHRDEV 3 #define EXT4_FT_BLKDEV 4 #define EXT4_FT_FIFO 5 #define EXT4_FT_SOCK 6 #define EXT4_FT_SYMLINK 7 #define EXT4_FT_MAX 8 #define EXT4_FT_DIR_CSUM 0xDE /* * EXT4_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 4 */ #define EXT4_DIR_PAD 4 #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) #define EXT4_MAX_REC_LEN ((1<<16)-1) /* * The rec_len is dependent on the type of directory. Directories that are * casefolded and encrypted need to store the hash as well, so we add room for * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should * pass NULL for dir, as those entries do not use the extra fields. */ static inline unsigned int ext4_dir_rec_len(__u8 name_len, const struct inode *dir) { int rec_len = (name_len + 8 + EXT4_DIR_ROUND); if (dir && ext4_hash_in_dirent(dir)) rec_len += sizeof(struct ext4_dir_entry_hash); return (rec_len & ~EXT4_DIR_ROUND); } /* * If we ever get support for fs block sizes > page_size, we'll need * to remove the #if statements in the next two functions... */ static inline unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) { unsigned len = le16_to_cpu(dlen); #if (PAGE_SIZE >= 65536) if (len == EXT4_MAX_REC_LEN || len == 0) return blocksize; return (len & 65532) | ((len & 3) << 16); #else return len; #endif } static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); #if (PAGE_SIZE >= 65536) if (len < 65536) return cpu_to_le16(len); if (len == blocksize) { if (blocksize == 65536) return cpu_to_le16(EXT4_MAX_REC_LEN); else return cpu_to_le16(0); } return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); #else return cpu_to_le16(len); #endif } /* * Hash Tree Directory indexing * (c) Daniel Phillips, 2001 */ #define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) #define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) /* Legal values for the dx_root hash_version field: */ #define DX_HASH_LEGACY 0 #define DX_HASH_HALF_MD4 1 #define DX_HASH_TEA 2 #define DX_HASH_LEGACY_UNSIGNED 3 #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 #define DX_HASH_SIPHASH 6 static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, const void *address, unsigned int length) { struct { struct shash_desc shash; char ctx[4]; } desc; BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx)); desc.shash.tfm = sbi->s_chksum_driver; *(u32 *)desc.ctx = crc; BUG_ON(crypto_shash_update(&desc.shash, address, length)); return *(u32 *)desc.ctx; } #ifdef __KERNEL__ /* hash info structure used by the directory hash */ struct dx_hash_info { u32 hash; u32 minor_hash; int hash_version; u32 *seed; }; /* 32 and 64 bit signed EOF for dx directories */ #define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) #define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) /* * Control parameters used by ext4_htree_next_block */ #define HASH_NB_ALWAYS 1 struct ext4_filename { const struct qstr *usr_fname; struct fscrypt_str disk_name; struct dx_hash_info hinfo; #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_str crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) struct fscrypt_str cf_name; #endif }; #define fname_name(p) ((p)->disk_name.name) #define fname_usr_name(p) ((p)->usr_fname->name) #define fname_len(p) ((p)->disk_name.len) /* * Describe an inode's exact location on disk and in memory */ struct ext4_iloc { struct buffer_head *bh; unsigned long offset; ext4_group_t block_group; }; static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) { return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); } static inline bool ext4_is_quota_file(struct inode *inode) { return IS_NOQUOTA(inode) && !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); } /* * This structure is stuffed into the struct file's private_data field * for directories. It is where we put information so that we can do * readdir operations in hash tree order. */ struct dir_private_info { struct rb_root root; struct rb_node *curr_node; struct fname *extra_fname; loff_t last_pos; __u32 curr_hash; __u32 curr_minor_hash; __u32 next_hash; }; /* calculate the first block number of the group */ static inline ext4_fsblk_t ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) { return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); } /* * Special error return code only used by dx_probe() and its callers. */ #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) /* htree levels for ext4 */ #define EXT4_HTREE_LEVEL_COMPAT 2 #define EXT4_HTREE_LEVEL 3 static inline int ext4_dir_htree_level(struct super_block *sb) { return ext4_has_feature_largedir(sb) ? EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; } /* * Timeout and state flag for lazy initialization inode thread. */ #define EXT4_DEF_LI_WAIT_MULT 10 #define EXT4_DEF_LI_MAX_START_DELAY 5 #define EXT4_LAZYINIT_QUIT 0x0001 #define EXT4_LAZYINIT_RUNNING 0x0002 /* * Lazy inode table initialization info */ struct ext4_lazy_init { unsigned long li_state; struct list_head li_request_list; struct mutex li_list_mtx; }; enum ext4_li_mode { EXT4_LI_MODE_PREFETCH_BBITMAP, EXT4_LI_MODE_ITABLE, }; struct ext4_li_request { struct super_block *lr_super; enum ext4_li_mode lr_mode; ext4_group_t lr_first_not_zeroed; ext4_group_t lr_next_group; struct list_head lr_request; unsigned long lr_next_sched; unsigned long lr_timeout; }; struct ext4_features { struct kobject f_kobj; struct completion f_kobj_unregister; }; /* * This structure will be used for multiple mount protection. It will be * written into the block number saved in the s_mmp_block field in the * superblock. Programs that check MMP should assume that if * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe * to use the filesystem, regardless of how old the timestamp is. */ #define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ #define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ #define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ #define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ struct mmp_struct { __le32 mmp_magic; /* Magic number for MMP */ __le32 mmp_seq; /* Sequence no. updated periodically */ /* * mmp_time, mmp_nodename & mmp_bdevname are only used for information * purposes and do not affect the correctness of the algorithm */ __le64 mmp_time; /* Time last updated */ char mmp_nodename[64]; /* Node which last updated MMP block */ char mmp_bdevname[32]; /* Bdev which last updated MMP block */ /* * mmp_check_interval is used to verify if the MMP block has been * updated on the block device. The value is updated based on the * maximum time to write the MMP block during an update cycle. */ __le16 mmp_check_interval; __le16 mmp_pad1; __le32 mmp_pad2[226]; __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ }; /* arguments passed to the mmp thread */ struct mmpd_data { struct buffer_head *bh; /* bh from initial read_mmp_block() */ struct super_block *sb; /* super block of the fs */ }; /* * Check interval multiplier * The MMP block is written every update interval and initially checked every * update interval x the multiplier (the value is then adapted based on the * write latency). The reason is that writes can be delayed under load and we * don't want readers to incorrectly assume that the filesystem is no longer * in use. */ #define EXT4_MMP_CHECK_MULT 2UL /* * Minimum interval for MMP checking in seconds. */ #define EXT4_MMP_MIN_CHECK_INTERVAL 5UL /* * Maximum interval for MMP checking in seconds. */ #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL /* * Function prototypes */ /* * Ok, these declarations are also in <linux/kernel.h> but none of the * ext4 source programs needs to include it so they are duplicated here. */ # define NORET_TYPE /**/ # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, /* bitmap.c */ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz); int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz); void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); /* balloc.c */ extern void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); extern ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block); extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned int flags, unsigned long *count, int *errp); extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags); extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ext4_group_t group); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, bool ignore_locked); extern int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh); extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); #if IS_ENABLED(CONFIG_UNICODE) extern int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *fname); #endif /* ext4 encryption related stuff goes here crypto.c */ #ifdef CONFIG_FS_ENCRYPTION extern const struct fscrypt_operations ext4_cryptops; int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname); int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, struct ext4_filename *fname); void ext4_fname_free_filename(struct ext4_filename *fname); int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg); #else /* !CONFIG_FS_ENCRYPTION */ static inline int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname) { int err = 0; fname->usr_fname = iname; fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; #if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif return err; } static inline int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, struct ext4_filename *fname) { return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname); } static inline void ext4_fname_free_filename(struct ext4_filename *fname) { #if IS_ENABLED(CONFIG_UNICODE) kfree(fname->cf_name.name); fname->cf_name.name = NULL; #endif } static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } #endif /* !CONFIG_FS_ENCRYPTION */ /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, struct ext4_dir_entry_2 *, struct buffer_head *, char *, int, unsigned int); #define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); void ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!ext4_has_feature_dir_index(inode->i_sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { /* ext4_iget() should have caught this... */ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } } static const unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; static inline unsigned char get_dtype(struct super_block *sb, int filetype) { if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) return DT_UNKNOWN; return ext4_filetype_table[filetype]; } extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size); /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); /* hash.c */ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ extern int ext4_mark_inode_used(struct super_block *sb, int ino); extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, int handle_type, unsigned int line_no, int nblocks); #define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr), \ (goal), (owner), i_flags, 0, 0, 0) #define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \ type, nblocks) \ __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \ 0, (type), __LINE__, (nblocks)) extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* fast_commit.c */ int ext4_fc_info_show(struct seq_file *seq, void *v); void ext4_fc_init(struct super_block *sb, journal_t *journal); void ext4_fc_init_inode(struct inode *inode); void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, struct dentry *dentry); void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); void ext4_fc_start_update(struct inode *inode); void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); int ext4_fc_commit(journal_t *journal, tid_t commit_tid); int __init ext4_fc_init_dentry_cache(void); void ext4_fc_destroy_dentry_cache(void); int ext4_fc_record_regions(struct super_block *sb, int ino, ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay); /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; extern const struct seq_operations ext4_mb_seq_structs_summary_ops; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); extern void ext4_discard_preallocations(struct inode *, unsigned int); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, unsigned int nr, int *cnt); extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, unsigned int nr); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); extern int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, bool state); static inline bool ext4_mb_cr_expensive(enum criteria cr) { return cr >= CR_GOAL_LEN_SLOW; } /* inode.c */ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei); int ext4_inode_is_fast_symlink(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 typedef enum { EXT4_IGET_NORMAL = 0, EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ EXT4_IGET_BAD = 0x0004, /* Allow to iget a bad inode */ EXT4_IGET_EA_INODE = 0x0008 /* Inode should contain an EA value */ } ext4_iget_flags; extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line); #define ext4_iget(sb, ino, flags) \ __ext4_iget((sb), (ino), (flags), __func__, __LINE__) extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern u32 ext4_dio_alignment(struct inode *inode); extern int ext4_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc); extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_release_space(struct inode *inode, int to_free); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); int ext4_update_overhead(struct super_block *sb, bool force); int ext4_force_shutdown(struct super_block *sb, u32 flags); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); extern int ext4_ind_migrate(struct inode *inode); /* namei.c */ extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode); extern int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); extern int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir); extern int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, int buf_size, int csum_size); extern bool ext4_empty_dir(struct inode *inode); /* resize.c */ extern void ext4_kvfree_array_rcu(void *to_free); extern int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input); extern int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count); extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); extern unsigned int ext4_list_backups(struct super_block *sb, unsigned int *three, unsigned int *five, unsigned int *seven); /* super.c */ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags); extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block); extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io); extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io); extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern __le32 ext4_superblock_csum(struct super_block *sb, struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ext4_group_t block_group); extern __printf(7, 8) void __ext4_error(struct super_block *, const char *, unsigned int, bool, int, __u64, const char *, ...); extern __printf(6, 7) void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, int, const char *, ...); extern __printf(5, 6) void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...); extern __printf(4, 5) void __ext4_warning_inode(const struct inode *inode, const char *function, unsigned int line, const char *fmt, ...); extern __printf(3, 4) void __ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, const char *, unsigned int, const char *); extern __printf(7, 8) void __ext4_grp_locked_error(const char *, unsigned int, struct super_block *, ext4_group_t, unsigned long, ext4_fsblk_t, const char *, ...); #define EXT4_ERROR_INODE(inode, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) #define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...) \ __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a) #define ext4_error_inode_block(inode, block, err, fmt, a...) \ __ext4_error_inode((inode), __func__, __LINE__, (block), (err), \ (fmt), ## a) #define EXT4_ERROR_FILE(file, block, fmt, a...) \ ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) #define ext4_abort(sb, err, fmt, a...) \ __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a) #ifdef CONFIG_PRINTK #define ext4_error_inode(inode, func, line, block, fmt, ...) \ __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__) #define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ __ext4_error_inode((inode), (func), (line), (block), \ (err), (fmt), ##__VA_ARGS__) #define ext4_error_file(file, func, line, block, fmt, ...) \ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) #define ext4_error(sb, fmt, ...) \ __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \ ##__VA_ARGS__) #define ext4_error_err(sb, err, fmt, ...) \ __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \ ##__VA_ARGS__) #define ext4_warning(sb, fmt, ...) \ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_warning_inode(inode, fmt, ...) \ __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_msg(sb, level, fmt, ...) \ __ext4_msg(sb, level, fmt, ##__VA_ARGS__) #define dump_mmp_msg(sb, mmp, msg) \ __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ fmt, ##__VA_ARGS__) #else #define ext4_error_inode(inode, func, line, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_inode(inode, "", 0, block, 0, " "); \ } while (0) #define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_inode(inode, "", 0, block, err, " "); \ } while (0) #define ext4_error_file(file, func, line, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_file(file, "", 0, block, " "); \ } while (0) #define ext4_error(sb, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error(sb, "", 0, false, 0, 0, " "); \ } while (0) #define ext4_error_err(sb, err, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error(sb, "", 0, false, err, 0, " "); \ } while (0) #define ext4_warning(sb, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning(sb, "", 0, " "); \ } while (0) #define ext4_warning_inode(inode, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning_inode(inode, "", 0, " "); \ } while (0) #define ext4_msg(sb, level, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_msg(sb, "", " "); \ } while (0) #define dump_mmp_msg(sb, mmp, msg) \ __dump_mmp_msg(sb, mmp, "", 0, "") #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ } while (0) #endif extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_free_group_clusters(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_used_dirs_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_itable_unused_count(struct super_block *sb, struct ext4_group_desc *bg); extern void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_free_group_clusters_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); static inline int ext4_has_metadata_csum(struct super_block *sb) { WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && !EXT4_SB(sb)->s_chksum_driver); return ext4_has_feature_metadata_csum(sb) && (EXT4_SB(sb)->s_chksum_driver != NULL); } static inline int ext4_has_group_desc_csum(struct super_block *sb) { return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); } #define ext4_read_incompat_64bit_val(es, name) \ (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \ ? (ext4_fsblk_t)le32_to_cpu(es->name##_hi) << 32 : 0) | \ le32_to_cpu(es->name##_lo)) static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_blocks_count); } static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_r_blocks_count); } static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_free_blocks_count); } static inline void ext4_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_blocks_count_lo = cpu_to_le32((u32)blk); es->s_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline loff_t ext4_isize(struct super_block *sb, struct ext4_inode *raw_inode) { if (ext4_has_feature_largedir(sb) || S_ISREG(le16_to_cpu(raw_inode->i_mode))) return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | le32_to_cpu(raw_inode->i_size_lo); return (loff_t) le32_to_cpu(raw_inode->i_size_lo); } static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) { raw_inode->i_size_lo = cpu_to_le32(i_size); raw_inode->i_size_high = cpu_to_le32(i_size >> 32); } /* * Reading s_groups_count requires using smp_rmb() afterwards. See * the locking protocol documented in the comments of ext4_group_add() * in resize.c */ static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) { ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; smp_rmb(); return ngroups; } static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, ext4_group_t block_group) { return block_group >> sbi->s_log_groups_per_flex; } static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) { return 1 << sbi->s_log_groups_per_flex; } #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ __ext4_std_error((sb), __func__, __LINE__, (errno)); \ } while (0) #ifdef CONFIG_SMP /* Each CPU can accumulate percpu_counter_batch clusters in their local * counters. So we need to make sure we have free clusters more * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. */ #define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) #else #define EXT4_FREECLUSTERS_WATERMARK 0 #endif /* Update i_disksize. Requires i_rwsem to avoid races with truncate */ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); up_write(&EXT4_I(inode)->i_data_sem); } /* Update i_size, i_disksize. Requires i_rwsem to avoid races with truncate */ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) { int changed = 0; if (newsize > inode->i_size) { i_size_write(inode, newsize); changed = 1; } if (newsize > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, newsize); changed |= 2; } return changed; } int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len); struct ext4_group_info { unsigned long bb_state; #ifdef AGGRESSIVE_CHECK unsigned long bb_check_counter; #endif struct rb_root bb_free_root; ext4_grpblk_t bb_first_free; /* first free block */ ext4_grpblk_t bb_free; /* total free blocks */ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ int bb_avg_fragment_size_order; /* order of average fragment in BG */ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ ext4_group_t bb_group; /* Group number */ struct list_head bb_prealloc_list; #ifdef DOUBLE_CHECK void *bb_bitmap; #endif struct rw_semaphore alloc_sem; struct list_head bb_avg_fragment_size_node; struct list_head bb_largest_free_order_node; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. * bb_counters[3] = 5 means * 5 free 8-block regions. */ }; #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) #define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) #define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_WAS_TRIMMED(grp) \ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_SET_TRIMMED(grp) \ (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, ext4_group_t group) { return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); } /* * Returns true if the filesystem is busy enough that attempts to * access the block group locks has run into contention. */ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) { return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); } static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { spinlock_t *lock = ext4_group_lock_ptr(sb, group); if (spin_trylock(lock)) /* * We're able to grab the lock right away, so drop the * lock contention counter. */ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); else { /* * The lock is busy, so bump the contention counter, * and then wait on the spin lock. */ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, EXT4_MAX_CONTENTION); spin_lock(lock); } } static inline void ext4_unlock_group(struct super_block *sb, ext4_group_t group) { spin_unlock(ext4_group_lock_ptr(sb, group)); } #ifdef CONFIG_QUOTA static inline bool ext4_quota_capable(struct super_block *sb) { return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb)); } static inline bool ext4_is_quota_journalled(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); return (ext4_has_feature_quota(sb) || sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); } int ext4_enable_quotas(struct super_block *sb); #endif /* * Block validity checking */ #define ext4_check_indirect_blockref(inode, bh) \ ext4_check_blockref(__func__, __LINE__, inode, \ (__le32 *)(bh)->b_data, \ EXT4_ADDR_PER_BLOCK((inode)->i_sb)) #define ext4_ind_check_inode(inode) \ ext4_check_blockref(__func__, __LINE__, inode, \ EXT4_I(inode)->i_data, \ EXT4_NDIR_BLOCKS) /* * Inodes and files operations */ /* dir.c */ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct page **pagep); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); extern int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct page **pagep, void **fsdata); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); extern int ext4_read_inline_dir(struct file *filp, struct dir_context *ctx, int *has_inline_data); extern int ext4_inlinedir_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); extern int ext4_delete_inline_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data); extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval); extern void *ext4_read_inline_link(struct inode *inode); struct iomap; extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int ext4_convert_inline_data(struct inode *inode); static inline int ext4_has_inline_data(struct inode *inode) { return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && EXT4_I(inode)->i_inline_off; } /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, struct ext4_dir_entry_2 *de, int blocksize, int csum_size, unsigned int parent_ino, int dotdot_real_len); extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh); extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, struct inode *inode, struct dentry *dentry); extern int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry); #define S_SHIFT 12 static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, }; static inline void ext4_set_de_type(struct super_block *sb, struct ext4_dir_entry_2 *de, umode_t mode) { if (ext4_has_feature_filetype(sb)) de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } /* readpages.c */ extern int ext4_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct folio *folio); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); /* symlink.c */ extern const struct inode_operations ext4_encrypted_symlink_inode_operations; extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; /* sysfs.c */ extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi); extern int ext4_register_sysfs(struct super_block *sb); extern void ext4_unregister_sysfs(struct super_block *sb); extern int __init ext4_init_sysfs(void); extern void ext4_exit_sysfs(void); /* block_validity */ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); extern int __init ext4_init_system_zone(void); extern void ext4_exit_system_zone(void); extern int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count); extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, ext4_fsblk_t start_blk, unsigned int count); /* extents.c */ struct ext4_ext_path; struct ext4_extent; /* * Maximum number of logical blocks in a file; ext4_extent's ee_block is * __le32. */ #define EXT_MAX_BLOCKS 0xffffffff extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_truncate(handle_t *, struct inode *); extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path **, struct ext4_extent *, int); extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path **, int flags); extern void ext4_free_ext_path(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, int check_cred, int restart_cred, int revoke_cred); extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end); extern int ext4_ext_replay_set_iblocks(struct inode *inode); extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, int len, int unwritten, ext4_fsblk_t pblk); extern int ext4_ext_clear_bb(struct inode *inode); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, struct inode *second); extern void ext4_double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode); extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); extern int ext4_put_io_end(ext4_io_end_t *io_end); extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); extern void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc); extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, size_t len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); /* mmp.c */ extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); /* verity.c */ extern const struct fsverity_operations ext4_verityops; /* orphan.c */ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es); extern void ext4_release_orphan_info(struct super_block *sb); extern int ext4_init_orphan_info(struct super_block *sb); extern int ext4_orphan_file_empty(struct super_block *sb); extern void ext4_orphan_file_block_trigger( struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size); /* * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough * to mark the bitmap uptodate. We need to also zero-out the bitmap */ #define BH_BITMAP_UPTODATE BH_JBDPrivateStart static inline int bitmap_uptodate(struct buffer_head *bh) { return (buffer_uptodate(bh) && test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); } static inline void set_bitmap_uptodate(struct buffer_head *bh) { set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } /* For ioend & aio unwritten conversion wait queues */ #define EXT4_WQ_HASH_SZ 37 #define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ EXT4_WQ_HASH_SZ]) extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; extern int ext4_resize_begin(struct super_block *sb); extern int ext4_resize_end(struct super_block *sb, bool update_backups); static inline void ext4_set_io_unwritten_flag(struct inode *inode, struct ext4_io_end *io_end) { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { io_end->flag |= EXT4_IO_END_UNWRITTEN; atomic_inc(&EXT4_I(inode)->i_unwritten); } } static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { struct inode *inode = io_end->inode; if (io_end->flag & EXT4_IO_END_UNWRITTEN) { io_end->flag &= ~EXT4_IO_END_UNWRITTEN; /* Wake up anyone waiting on unwritten extent conversion */ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) wake_up_all(ext4_ioend_wq(inode)); } } extern const struct iomap_ops ext4_iomap_ops; extern const struct iomap_ops ext4_iomap_overwrite_ops; extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) { /* * If the buffer has the write error flag, we have failed * to write out data in the block. In this case, we don't * have to read the block because we may read the old data * successfully. */ if (buffer_write_io_error(bh)) set_buffer_uptodate(bh); return buffer_uptodate(bh); } #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* _EXT4_H */
11 14210 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM x86_fpu #if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FPU_H #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(x86_fpu, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu), TP_STRUCT__entry( __field(struct fpu *, fpu) __field(bool, load_fpu) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures; __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv; } ), TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, __entry->load_fpu, __entry->xfeatures, __entry->xcomp_bv ) ); DEFINE_EVENT(x86_fpu, x86_fpu_before_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_after_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_before_restore, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_after_restore, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_init_state, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_dropped, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_copy_src, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH asm/trace/ #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE fpu #endif /* _TRACE_FPU_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 /* SPDX-License-Identifier: GPL-2.0+ */ /* * MACsec netdev header, used for h/w accelerated implementations. * * Copyright (c) 2015 Sabrina Dubroca <sd@queasysnail.net> */ #ifndef _NET_MACSEC_H_ #define _NET_MACSEC_H_ #include <linux/u64_stats_sync.h> #include <linux/if_vlan.h> #include <uapi/linux/if_link.h> #include <uapi/linux/if_macsec.h> #define MACSEC_DEFAULT_PN_LEN 4 #define MACSEC_XPN_PN_LEN 8 #define MACSEC_NUM_AN 4 /* 2 bits for the association number */ #define MACSEC_SCI_LEN 8 #define MACSEC_PORT_ES (htons(0x0001)) #define MACSEC_TCI_VERSION 0x80 #define MACSEC_TCI_ES 0x40 /* end station */ #define MACSEC_TCI_SC 0x20 /* SCI present */ #define MACSEC_TCI_SCB 0x10 /* epon */ #define MACSEC_TCI_E 0x08 /* encryption */ #define MACSEC_TCI_C 0x04 /* changed text */ #define MACSEC_AN_MASK 0x03 /* association number */ #define MACSEC_TCI_CONFID (MACSEC_TCI_E | MACSEC_TCI_C) #define MACSEC_DEFAULT_ICV_LEN 16 typedef u64 __bitwise sci_t; typedef u32 __bitwise ssci_t; struct metadata_dst; typedef union salt { struct { u32 ssci; u64 pn; } __packed; u8 bytes[MACSEC_SALT_LEN]; } __packed salt_t; typedef union pn { struct { #if defined(__LITTLE_ENDIAN_BITFIELD) u32 lower; u32 upper; #elif defined(__BIG_ENDIAN_BITFIELD) u32 upper; u32 lower; #else #error "Please fix <asm/byteorder.h>" #endif }; u64 full64; } pn_t; /** * struct macsec_key - SA key * @id: user-provided key identifier * @tfm: crypto struct, key storage * @salt: salt used to generate IV in XPN cipher suites */ struct macsec_key { u8 id[MACSEC_KEYID_LEN]; struct crypto_aead *tfm; salt_t salt; }; struct macsec_rx_sc_stats { __u64 InOctetsValidated; __u64 InOctetsDecrypted; __u64 InPktsUnchecked; __u64 InPktsDelayed; __u64 InPktsOK; __u64 InPktsInvalid; __u64 InPktsLate; __u64 InPktsNotValid; __u64 InPktsNotUsingSA; __u64 InPktsUnusedSA; }; struct macsec_rx_sa_stats { __u32 InPktsOK; __u32 InPktsInvalid; __u32 InPktsNotValid; __u32 InPktsNotUsingSA; __u32 InPktsUnusedSA; }; struct macsec_tx_sa_stats { __u32 OutPktsProtected; __u32 OutPktsEncrypted; }; struct macsec_tx_sc_stats { __u64 OutPktsProtected; __u64 OutPktsEncrypted; __u64 OutOctetsProtected; __u64 OutOctetsEncrypted; }; struct macsec_dev_stats { __u64 OutPktsUntagged; __u64 InPktsUntagged; __u64 OutPktsTooLong; __u64 InPktsNoTag; __u64 InPktsBadTag; __u64 InPktsUnknownSCI; __u64 InPktsNoSCI; __u64 InPktsOverrun; }; /** * struct macsec_rx_sa - receive secure association * @active: * @next_pn: packet number expected for the next packet * @lock: protects next_pn manipulations * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats */ struct macsec_rx_sa { struct macsec_key key; ssci_t ssci; spinlock_t lock; union { pn_t next_pn_halves; u64 next_pn; }; refcount_t refcnt; bool active; struct macsec_rx_sa_stats __percpu *stats; struct macsec_rx_sc *sc; struct rcu_head rcu; }; struct pcpu_rx_sc_stats { struct macsec_rx_sc_stats stats; struct u64_stats_sync syncp; }; struct pcpu_tx_sc_stats { struct macsec_tx_sc_stats stats; struct u64_stats_sync syncp; }; /** * struct macsec_rx_sc - receive secure channel * @sci: secure channel identifier for this SC * @active: channel is active * @sa: array of secure associations * @stats: per-SC stats */ struct macsec_rx_sc { struct macsec_rx_sc __rcu *next; sci_t sci; bool active; struct macsec_rx_sa __rcu *sa[MACSEC_NUM_AN]; struct pcpu_rx_sc_stats __percpu *stats; refcount_t refcnt; struct rcu_head rcu_head; }; /** * struct macsec_tx_sa - transmit secure association * @active: * @next_pn: packet number to use for the next packet * @lock: protects next_pn manipulations * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats */ struct macsec_tx_sa { struct macsec_key key; ssci_t ssci; spinlock_t lock; union { pn_t next_pn_halves; u64 next_pn; }; refcount_t refcnt; bool active; struct macsec_tx_sa_stats __percpu *stats; struct rcu_head rcu; }; /** * struct macsec_tx_sc - transmit secure channel * @active: * @encoding_sa: association number of the SA currently in use * @encrypt: encrypt packets on transmit, or authenticate only * @send_sci: always include the SCI in the SecTAG * @end_station: * @scb: single copy broadcast flag * @sa: array of secure associations * @stats: stats for this TXSC * @md_dst: MACsec offload metadata dst */ struct macsec_tx_sc { bool active; u8 encoding_sa; bool encrypt; bool send_sci; bool end_station; bool scb; struct macsec_tx_sa __rcu *sa[MACSEC_NUM_AN]; struct pcpu_tx_sc_stats __percpu *stats; struct metadata_dst *md_dst; }; /** * struct macsec_secy - MACsec Security Entity * @netdev: netdevice for this SecY * @n_rx_sc: number of receive secure channels configured on this SecY * @sci: secure channel identifier used for tx * @key_len: length of keys used by the cipher suite * @icv_len: length of ICV used by the cipher suite * @validate_frames: validation mode * @xpn: enable XPN for this SecY * @operational: MAC_Operational flag * @protect_frames: enable protection for this SecY * @replay_protect: enable packet number checks on receive * @replay_window: size of the replay window * @tx_sc: transmit secure channel * @rx_sc: linked list of receive secure channels */ struct macsec_secy { struct net_device *netdev; unsigned int n_rx_sc; sci_t sci; u16 key_len; u16 icv_len; enum macsec_validation_type validate_frames; bool xpn; bool operational; bool protect_frames; bool replay_protect; u32 replay_window; struct macsec_tx_sc tx_sc; struct macsec_rx_sc __rcu *rx_sc; }; /** * struct macsec_context - MACsec context for hardware offloading */ struct macsec_context { union { struct net_device *netdev; struct phy_device *phydev; }; enum macsec_offload offload; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct { bool update_pn; unsigned char assoc_num; u8 key[MACSEC_MAX_KEY_LEN]; union { struct macsec_rx_sa *rx_sa; struct macsec_tx_sa *tx_sa; }; } sa; union { struct macsec_tx_sc_stats *tx_sc_stats; struct macsec_tx_sa_stats *tx_sa_stats; struct macsec_rx_sc_stats *rx_sc_stats; struct macsec_rx_sa_stats *rx_sa_stats; struct macsec_dev_stats *dev_stats; } stats; }; /** * struct macsec_ops - MACsec offloading operations */ struct macsec_ops { /* Device wide */ int (*mdo_dev_open)(struct macsec_context *ctx); int (*mdo_dev_stop)(struct macsec_context *ctx); /* SecY */ int (*mdo_add_secy)(struct macsec_context *ctx); int (*mdo_upd_secy)(struct macsec_context *ctx); int (*mdo_del_secy)(struct macsec_context *ctx); /* Security channels */ int (*mdo_add_rxsc)(struct macsec_context *ctx); int (*mdo_upd_rxsc)(struct macsec_context *ctx); int (*mdo_del_rxsc)(struct macsec_context *ctx); /* Security associations */ int (*mdo_add_rxsa)(struct macsec_context *ctx); int (*mdo_upd_rxsa)(struct macsec_context *ctx); int (*mdo_del_rxsa)(struct macsec_context *ctx); int (*mdo_add_txsa)(struct macsec_context *ctx); int (*mdo_upd_txsa)(struct macsec_context *ctx); int (*mdo_del_txsa)(struct macsec_context *ctx); /* Statistics */ int (*mdo_get_dev_stats)(struct macsec_context *ctx); int (*mdo_get_tx_sc_stats)(struct macsec_context *ctx); int (*mdo_get_tx_sa_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sc_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sa_stats)(struct macsec_context *ctx); }; void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa); static inline bool macsec_send_sci(const struct macsec_secy *secy) { const struct macsec_tx_sc *tx_sc = &secy->tx_sc; return tx_sc->send_sci || (secy->n_rx_sc > 1 && !tx_sc->end_station && !tx_sc->scb); } struct net_device *macsec_get_real_dev(const struct net_device *dev); bool macsec_netdev_is_offloaded(struct net_device *dev); static inline void *macsec_netdev_priv(const struct net_device *dev) { #if IS_ENABLED(CONFIG_VLAN_8021Q) if (is_vlan_dev(dev)) return netdev_priv(vlan_dev_priv(dev)->real_dev); #endif return netdev_priv(dev); } #endif /* _NET_MACSEC_H_ */
419 418 419 419 419 419 18 401 401 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2009 Red Hat, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/numa_balancing.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/shrinker.h> #include <linux/mm_inline.h> #include <linux/swapops.h> #include <linux/backing-dev.h> #include <linux/dax.h> #include <linux/khugepaged.h> #include <linux/freezer.h> #include <linux/pfn_t.h> #include <linux/mman.h> #include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/migrate.h> #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/page_owner.h> #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> #include <asm/tlb.h> #include <asm/pgalloc.h> #include "internal.h" #include "swap.h" #define CREATE_TRACE_POINTS #include <trace/events/thp.h> /* * By default, transparent hugepage support is disabled in order to avoid * risking an increased memory footprint for applications that are not * guaranteed to benefit from it. When transparent hugepage support is * enabled, it is for all mappings, and khugepaged scans all mappings. * Defrag is invoked by khugepaged hugepage allocations and by page faults * for all hugepage allocations. */ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS (1<<TRANSPARENT_HUGEPAGE_FLAG)| #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| #endif (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)| (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); static struct shrinker *deferred_split_shrinker; static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc); static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc); static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, bool smaps, bool in_pf, bool enforce_sysfs) { if (!vma->vm_mm) /* vdso */ return false; /* * Explicitly disabled through madvise or prctl, or some * architectures may disable THP for some mappings, for * example, s390 kvm. * */ if ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; /* * If the hardware/firmware marked hugepage support disabled. */ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) return false; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ if (vma_is_dax(vma)) return in_pf; /* * khugepaged special VMA and hugetlb VMA. * Must be checked after dax since some dax mappings may have * VM_MIXEDMAP set. */ if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED)) return false; /* * Check alignment for file vma and size for both file and anon vma. * * Skip the check for page fault. Huge fault does the check in fault * handlers. And this check is not suitable for huge PUD fault. */ if (!in_pf && !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) return false; /* * Enabled via shmem mount options or sysfs settings. * Must be done before hugepage flags check since shmem has its * own flags. */ if (!in_pf && shmem_file(vma->vm_file)) return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, !enforce_sysfs, vma->vm_mm, vm_flags); /* Enforce sysfs THP requirements as necessary */ if (enforce_sysfs && (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always()))) return false; if (!vma_is_anonymous(vma)) { /* * Trust that ->huge_fault() handlers know what they are doing * in fault path. */ if (((in_pf || smaps)) && vma->vm_ops->huge_fault) return true; /* Only regular file is valid in collapse path */ if (((!in_pf || smaps)) && file_thp_enabled(vma)) return true; return false; } if (vma_is_temporary_stack(vma)) return false; /* * THPeligible bit of smaps should show 1 for proper VMAs even * though anon_vma is not initialized yet. * * Allow page fault since anon_vma may be not initialized until * the first page fault. */ if (!vma->anon_vma) return (smaps || in_pf); return true; } static bool get_huge_zero_page(void) { struct page *zero_page; retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) return true; zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!zero_page) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); return false; } preempt_disable(); if (cmpxchg(&huge_zero_page, NULL, zero_page)) { preempt_enable(); __free_pages(zero_page, compound_order(zero_page)); goto retry; } WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page)); /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); count_vm_event(THP_ZERO_PAGE_ALLOC); return true; } static void put_huge_zero_page(void) { /* * Counter should never go to zero here. Only shrinker can put * last reference. */ BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); } struct page *mm_get_huge_zero_page(struct mm_struct *mm) { if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) return READ_ONCE(huge_zero_page); if (!get_huge_zero_page()) return NULL; if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) put_huge_zero_page(); return READ_ONCE(huge_zero_page); } void mm_put_huge_zero_page(struct mm_struct *mm) { if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) put_huge_zero_page(); } static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, struct shrink_control *sc) { /* we can free zero page only if last reference remains */ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; } static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, struct shrink_control *sc) { if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { struct page *zero_page = xchg(&huge_zero_page, NULL); BUG_ON(zero_page == NULL); WRITE_ONCE(huge_zero_pfn, ~0UL); __free_pages(zero_page, compound_order(zero_page)); return HPAGE_PMD_NR; } return 0; } static struct shrinker *huge_zero_page_shrinker; #ifdef CONFIG_SYSFS static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *output; if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags)) output = "[always] madvise never"; else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags)) output = "always [madvise] never"; else output = "always madvise [never]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { ssize_t ret = count; if (sysfs_streq(buf, "always")) { clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "madvise")) { clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "never")) { clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); } else ret = -EINVAL; if (ret > 0) { int err = start_stop_khugepaged(); if (err) ret = err; } return ret; } static struct kobj_attribute enabled_attr = __ATTR_RW(enabled); ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag) { return sysfs_emit(buf, "%d\n", !!test_bit(flag, &transparent_hugepage_flags)); } ssize_t single_hugepage_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag) { unsigned long value; int ret; ret = kstrtoul(buf, 10, &value); if (ret < 0) return ret; if (value > 1) return -EINVAL; if (value) set_bit(flag, &transparent_hugepage_flags); else clear_bit(flag, &transparent_hugepage_flags); return count; } static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *output; if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) output = "[always] defer defer+madvise madvise never"; else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) output = "always [defer] defer+madvise madvise never"; else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) output = "always defer [defer+madvise] madvise never"; else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) output = "always defer defer+madvise [madvise] never"; else output = "always defer defer+madvise madvise [never]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (sysfs_streq(buf, "always")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "defer+madvise")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "defer")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "madvise")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "never")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); } else return -EINVAL; return count; } static struct kobj_attribute defrag_attr = __ATTR_RW(defrag); static ssize_t use_zero_page_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); } static ssize_t use_zero_page_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); } static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page); static ssize_t hpage_pmd_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE); } static struct kobj_attribute hpage_pmd_size_attr = __ATTR_RO(hpage_pmd_size); static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, &use_zero_page_attr.attr, &hpage_pmd_size_attr.attr, #ifdef CONFIG_SHMEM &shmem_enabled_attr.attr, #endif NULL, }; static const struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, }; static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) { int err; *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { pr_err("failed to create transparent hugepage kobject\n"); return -ENOMEM; } err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) { pr_err("failed to register transparent hugepage group\n"); goto delete_obj; } err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) { pr_err("failed to register transparent hugepage group\n"); goto remove_hp_group; } return 0; remove_hp_group: sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); delete_obj: kobject_put(*hugepage_kobj); return err; } static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) { sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); kobject_put(hugepage_kobj); } #else static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) { return 0; } static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) { } #endif /* CONFIG_SYSFS */ static int __init thp_shrinker_init(void) { huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero"); if (!huge_zero_page_shrinker) return -ENOMEM; deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | SHRINKER_NONSLAB, "thp-deferred_split"); if (!deferred_split_shrinker) { shrinker_free(huge_zero_page_shrinker); return -ENOMEM; } huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count; huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan; shrinker_register(huge_zero_page_shrinker); deferred_split_shrinker->count_objects = deferred_split_count; deferred_split_shrinker->scan_objects = deferred_split_scan; shrinker_register(deferred_split_shrinker); return 0; } static void __init thp_shrinker_exit(void) { shrinker_free(huge_zero_page_shrinker); shrinker_free(deferred_split_shrinker); } static int __init hugepage_init(void) { int err; struct kobject *hugepage_kobj; if (!has_transparent_hugepage()) { transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED; return -EINVAL; } /* * hugepages can't be allocated by the buddy allocator */ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_ORDER); /* * we use page->mapping and page->index in second tail page * as list_head: assuming THP order >= 2 */ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); err = hugepage_init_sysfs(&hugepage_kobj); if (err) goto err_sysfs; err = khugepaged_init(); if (err) goto err_slab; err = thp_shrinker_init(); if (err) goto err_shrinker; /* * By default disable transparent hugepages on smaller systems, * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { transparent_hugepage_flags = 0; return 0; } err = start_stop_khugepaged(); if (err) goto err_khugepaged; return 0; err_khugepaged: thp_shrinker_exit(); err_shrinker: khugepaged_destroy(); err_slab: hugepage_exit_sysfs(hugepage_kobj); err_sysfs: return err; } subsys_initcall(hugepage_init); static int __init setup_transparent_hugepage(char *str) { int ret = 0; if (!str) goto out; if (!strcmp(str, "always")) { set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); ret = 1; } else if (!strcmp(str, "madvise")) { clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); ret = 1; } else if (!strcmp(str, "never")) { clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); ret = 1; } out: if (!ret) pr_warn("transparent_hugepage= cannot parse, ignored\n"); return ret; } __setup("transparent_hugepage=", setup_transparent_hugepage); pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pmd = pmd_mkwrite(pmd, vma); return pmd; } #ifdef CONFIG_MEMCG static inline struct deferred_split *get_deferred_split_queue(struct folio *folio) { struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); if (memcg) return &memcg->deferred_split_queue; else return &pgdat->deferred_split_queue; } #else static inline struct deferred_split *get_deferred_split_queue(struct folio *folio) { struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); return &pgdat->deferred_split_queue; } #endif void folio_prep_large_rmappable(struct folio *folio) { VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); INIT_LIST_HEAD(&folio->_deferred_list); folio_set_large_rmappable(folio); } static inline bool is_transparent_hugepage(struct folio *folio) { if (!folio_test_large(folio)) return false; return is_huge_zero_page(&folio->page) || folio_test_large_rmappable(folio); } static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, loff_t off, unsigned long flags, unsigned long size) { loff_t off_end = off + len; loff_t off_align = round_up(off, size); unsigned long len_pad, ret; if (off_end <= off_align || (off_end - off_align) < size) return 0; len_pad = len + size; if (len_pad < len || (off + len_pad) < off) return 0; ret = current->mm->get_unmapped_area(filp, addr, len_pad, off >> PAGE_SHIFT, flags); /* * The failure might be due to length padding. The caller will retry * without the padding. */ if (IS_ERR_VALUE(ret)) return 0; /* * Do not try to align to THP boundary if allocation at the address * hint succeeds. */ if (ret == addr) return addr; ret += (off - ret) & (size - 1); return ret; } unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long ret; loff_t off = (loff_t)pgoff << PAGE_SHIFT; ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE); if (ret) return ret; return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = page_folio(page); pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret = 0; VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } folio_throttle_swaprate(folio, gfp); pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) { ret = VM_FAULT_OOM; goto release; } clear_huge_page(page, vmf->address, HPAGE_PMD_NR); /* * The memory barrier inside __folio_mark_uptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() * write. */ __folio_mark_uptodate(folio); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { goto unlock_release; } else { pmd_t entry; ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock_release; /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { spin_unlock(vmf->ptl); folio_put(folio); pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); folio_add_new_anon_rmap(folio, vma, haddr); folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } return 0; unlock_release: spin_unlock(vmf->ptl); release: if (pgtable) pte_free(vma->vm_mm, pgtable); folio_put(folio); return ret; } /* * always: directly stall for all thp allocations * defer: wake kswapd and fail if not immediately available * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise * fail if not immediately available * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately * available * never: never stall for any thp allocation */ gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) { const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); /* Always do synchronous compaction */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); /* Kick kcompactd and fail quickly */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; /* Synchronous compaction if madvised, otherwise kick kcompactd */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : __GFP_KSWAPD_RECLAIM); /* Only do synchronous compaction if madvised */ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); return GFP_TRANSHUGE_LIGHT; } /* Caller must hold page table lock. */ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) { pmd_t entry; if (!pmd_none(*pmd)) return; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); } vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; gfp_t gfp; struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; if (!transhuge_vma_suitable(vma, haddr)) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; struct page *zero_page; vm_fault_t ret; pgtable = pte_alloc_one(vma->vm_mm); if (unlikely(!pgtable)) return VM_FAULT_OOM; zero_page = mm_get_huge_zero_page(vma->vm_mm); if (unlikely(!zero_page)) { pte_free(vma->vm_mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); ret = 0; if (pmd_none(*vmf->pmd)) { ret = check_stable_address_space(vma->vm_mm); if (ret) { spin_unlock(vmf->ptl); pte_free(vma->vm_mm, pgtable); } else if (userfaultfd_missing(vma)) { spin_unlock(vmf->ptl); pte_free(vma->vm_mm, pgtable); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { set_huge_zero_page(pgtable, vma->vm_mm, vma, haddr, vmf->pmd, zero_page); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); } } else { spin_unlock(vmf->ptl); pte_free(vma->vm_mm, pgtable); } return ret; } gfp = vma_thp_gfp_mask(vma); folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, pgtable_t pgtable) { struct mm_struct *mm = vma->vm_mm; pmd_t entry; spinlock_t *ptl; ptl = pmd_lock(mm, pmd); if (!pmd_none(*pmd)) { if (write) { if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) { WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); goto out_unlock; } entry = pmd_mkyoung(*pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) update_mmu_cache_pmd(vma, addr, pmd); } goto out_unlock; } entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pmd_mkdevmap(entry); if (write) { entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); } if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); mm_inc_nr_ptes(mm); pgtable = NULL; } set_pmd_at(mm, addr, pmd, entry); update_mmu_cache_pmd(vma, addr, pmd); out_unlock: spin_unlock(ptl); if (pgtable) pte_free(mm, pgtable); } /** * vmf_insert_pfn_pmd - insert a pmd size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @write: whether it's a write fault * * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) { unsigned long addr = vmf->address & PMD_MASK; struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; pgtable_t pgtable = NULL; /* * If we had pmd_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit. */ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && !pfn_t_devmap(pfn)); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; if (arch_needs_pgtable_deposit()) { pgtable = pte_alloc_one(vma->vm_mm); if (!pgtable) return VM_FAULT_OOM; } track_pfn_insert(vma, &pgprot, pfn); insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pud = pud_mkwrite(pud); return pud; } static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, pfn_t pfn, bool write) { struct mm_struct *mm = vma->vm_mm; pgprot_t prot = vma->vm_page_prot; pud_t entry; spinlock_t *ptl; ptl = pud_lock(mm, pud); if (!pud_none(*pud)) { if (write) { if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) { WARN_ON_ONCE(!is_huge_zero_pud(*pud)); goto out_unlock; } entry = pud_mkyoung(*pud); entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); if (pudp_set_access_flags(vma, addr, pud, entry, 1)) update_mmu_cache_pud(vma, addr, pud); } goto out_unlock; } entry = pud_mkhuge(pfn_t_pud(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pud_mkdevmap(entry); if (write) { entry = pud_mkyoung(pud_mkdirty(entry)); entry = maybe_pud_mkwrite(entry, vma); } set_pud_at(mm, addr, pud, entry); update_mmu_cache_pud(vma, addr, pud); out_unlock: spin_unlock(ptl); } /** * vmf_insert_pfn_pud - insert a pud size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @write: whether it's a write fault * * Insert a pud size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) { unsigned long addr = vmf->address & PUD_MASK; struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; /* * If we had pud_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that * can't support a 'special' bit. */ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && !pfn_t_devmap(pfn)); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, pfn); insert_pfn_pud(vma, addr, vmf->pud, pfn, write); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write) { pmd_t _pmd; _pmd = pmd_mkyoung(*pmd); if (write) _pmd = pmd_mkdirty(_pmd); if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, pmd, _pmd, write)) update_mmu_cache_pmd(vma, addr, pmd); } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pmd_pfn(*pmd); struct mm_struct *mm = vma->vm_mm; struct page *page; int ret; assert_spin_locked(pmd_lockptr(mm, pmd)); if (flags & FOLL_WRITE && !pmd_write(*pmd)) return NULL; if (pmd_present(*pmd) && pmd_devmap(*pmd)) /* pass */; else return NULL; if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); /* * device mapped pages can only be returned if the * caller will manage the page reference count. */ if (!(flags & (FOLL_GET | FOLL_PIN))) return ERR_PTR(-EEXIST); pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; *pgmap = get_dev_pagemap(pfn, *pgmap); if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); ret = try_grab_page(page, flags); if (ret) page = ERR_PTR(ret); return page; } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; pgtable_t pgtable = NULL; int ret = -ENOMEM; /* Skip if can be re-fill on fault */ if (!vma_is_anonymous(dst_vma)) return 0; pgtable = pte_alloc_one(dst_mm); if (unlikely(!pgtable)) goto out; dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); ret = -EAGAIN; pmd = *src_pmd; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (unlikely(is_swap_pmd(pmd))) { swp_entry_t entry = pmd_to_swp_entry(pmd); VM_BUG_ON(!is_pmd_migration_entry(pmd)); if (!is_readable_migration_entry(entry)) { entry = make_readable_migration_entry( swp_offset(entry)); pmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*src_pmd)) pmd = pmd_swp_mksoft_dirty(pmd); if (pmd_swp_uffd_wp(*src_pmd)) pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(src_mm, addr, src_pmd, pmd); } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); if (!userfaultfd_wp(dst_vma)) pmd = pmd_swp_clear_uffd_wp(pmd); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; goto out_unlock; } #endif if (unlikely(!pmd_trans_huge(pmd))) { pte_free(dst_mm, pgtable); goto out_unlock; } /* * When page table lock is held, the huge zero pmd should not be * under splitting since we don't split the page itself, only pmd to * a page table. */ if (is_huge_zero_pmd(pmd)) { /* * get_huge_zero_page() will never allocate a new page here, * since we already have a zero page to copy. It just takes a * reference. */ mm_get_huge_zero_page(dst_mm); goto out_zero_page; } src_page = pmd_page(pmd); VM_BUG_ON_PAGE(!PageHead(src_page), src_page); get_page(src_page); if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) { /* Page maybe pinned: split and retry the fault on PTEs. */ put_page(src_page); pte_free(dst_mm, pgtable); spin_unlock(src_ptl); spin_unlock(dst_ptl); __split_huge_pmd(src_vma, src_pmd, addr, false, NULL); return -EAGAIN; } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); out_zero_page: mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); pmdp_set_wrprotect(src_mm, addr, src_pmd); if (!userfaultfd_wp(dst_vma)) pmd = pmd_clear_uffd_wp(pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; out_unlock: spin_unlock(src_ptl); spin_unlock(dst_ptl); out: return ret; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static void touch_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, bool write) { pud_t _pud; _pud = pud_mkyoung(*pud); if (write) _pud = pud_mkdirty(_pud); if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, pud, _pud, write)) update_mmu_cache_pud(vma, addr, pud); } struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap) { unsigned long pfn = pud_pfn(*pud); struct mm_struct *mm = vma->vm_mm; struct page *page; int ret; assert_spin_locked(pud_lockptr(mm, pud)); if (flags & FOLL_WRITE && !pud_write(*pud)) return NULL; if (pud_present(*pud) && pud_devmap(*pud)) /* pass */; else return NULL; if (flags & FOLL_TOUCH) touch_pud(vma, addr, pud, flags & FOLL_WRITE); /* * device mapped pages can only be returned if the * caller will manage the page reference count. * * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here: */ if (!(flags & (FOLL_GET | FOLL_PIN))) return ERR_PTR(-EEXIST); pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; *pgmap = get_dev_pagemap(pfn, *pgmap); if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); ret = try_grab_page(page, flags); if (ret) page = ERR_PTR(ret); return page; } int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma) { spinlock_t *dst_ptl, *src_ptl; pud_t pud; int ret; dst_ptl = pud_lock(dst_mm, dst_pud); src_ptl = pud_lockptr(src_mm, src_pud); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); ret = -EAGAIN; pud = *src_pud; if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) goto out_unlock; /* * When page table lock is held, the huge zero pud should not be * under splitting since we don't split the page itself, only pud to * a page table. */ if (is_huge_zero_pud(pud)) { /* No huge zero pud yet */ } /* * TODO: once we support anonymous pages, use page_try_dup_anon_rmap() * and split if duplicating fails. */ pudp_set_wrprotect(src_mm, addr, src_pud); pud = pud_mkold(pud_wrprotect(pud)); set_pud_at(dst_mm, addr, dst_pud, pud); ret = 0; out_unlock: spin_unlock(src_ptl); spin_unlock(dst_ptl); return ret; } void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { bool write = vmf->flags & FAULT_FLAG_WRITE; vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); if (unlikely(!pud_same(*vmf->pud, orig_pud))) goto unlock; touch_pud(vmf->vma, vmf->address, vmf->pud, write); unlock: spin_unlock(vmf->ptl); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ void huge_pmd_set_accessed(struct vm_fault *vmf) { bool write = vmf->flags & FAULT_FLAG_WRITE; vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) goto unlock; touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); unlock: spin_unlock(vmf->ptl); } vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct folio *folio; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t orig_pmd = vmf->orig_pmd; vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); if (is_huge_zero_pmd(orig_pmd)) goto fallback; spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); return 0; } page = pmd_page(orig_pmd); folio = page_folio(page); VM_BUG_ON_PAGE(!PageHead(page), page); /* Early check when only holding the PT lock. */ if (PageAnonExclusive(page)) goto reuse; if (!folio_trylock(folio)) { folio_get(folio); spin_unlock(vmf->ptl); folio_lock(folio); spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); folio_unlock(folio); folio_put(folio); return 0; } folio_put(folio); } /* Recheck after temporarily dropping the PT lock. */ if (PageAnonExclusive(page)) { folio_unlock(folio); goto reuse; } /* * See do_wp_page(): we can only reuse the folio exclusively if * there are no additional references. Note that we always drain * the LRU cache immediately after adding a THP. */ if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) goto unlock_fallback; if (folio_test_swapcache(folio)) folio_free_swap(folio); if (folio_ref_count(folio) == 1) { pmd_t entry; folio_move_anon_rmap(folio, vma); SetPageAnonExclusive(page); folio_unlock(folio); reuse: if (unlikely(unshare)) { spin_unlock(vmf->ptl); return 0; } entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); return 0; } unlock_fallback: folio_unlock(folio); spin_unlock(vmf->ptl); fallback: __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } static inline bool can_change_pmd_writable(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { struct page *page; if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) return false; /* Don't touch entries that are not even readable (NUMA hinting). */ if (pmd_protnone(pmd)) return false; /* Do we need write faults for softdirty tracking? */ if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) return false; /* Do we need write faults for uffd-wp tracking? */ if (userfaultfd_huge_pmd_wp(vma, pmd)) return false; if (!(vma->vm_flags & VM_SHARED)) { /* See can_change_pte_writable(). */ page = vm_normal_page_pmd(vma, addr, pmd); return page && PageAnon(page) && PageAnonExclusive(page); } /* See can_change_pte_writable(). */ return pmd_dirty(pmd); } /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* If the pmd is writable, we can write to the page. */ if (pmd_write(pmd)) return true; /* Maybe FOLL_FORCE is set to override it? */ if (!(flags & FOLL_FORCE)) return false; /* But FOLL_FORCE has no effect on shared mappings */ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) return false; /* ... or read-only private ones */ if (!(vma->vm_flags & VM_MAYWRITE)) return false; /* ... or already writable ones that just need to take a write fault */ if (vma->vm_flags & VM_WRITE) return false; /* * See can_change_pte_writable(): we broke COW and could map the page * writable if we have an exclusive anonymous page ... */ if (!page || !PageAnon(page) || !PageAnonExclusive(page)) return false; /* ... and a write-fault isn't required for other reasons. */ if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) return false; return !userfaultfd_huge_pmd_wp(vma, pmd); } struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; struct page *page; int ret; assert_spin_locked(pmd_lockptr(mm, pmd)); page = pmd_page(*pmd); VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); if ((flags & FOLL_WRITE) && !can_follow_write_pmd(*pmd, page, vma, flags)) return NULL; /* Avoid dumping huge zero page */ if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) return ERR_PTR(-EFAULT); if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags)) return NULL; if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); ret = try_grab_page(page, flags); if (ret) return ERR_PTR(ret); if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); return page; } /* NUMA hinting page fault entry point for trans huge pmds */ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pmd_t oldpmd = vmf->orig_pmd; pmd_t pmd; struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int nid = NUMA_NO_NODE; int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); bool migrated = false, writable = false; int flags = 0; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { spin_unlock(vmf->ptl); goto out; } pmd = pmd_modify(oldpmd, vma->vm_page_prot); /* * Detect now whether the PMD could be writable; this information * is only valid while holding the PT lock. */ writable = pmd_write(pmd); if (!writable && vma_wants_manual_pte_write_upgrade(vma) && can_change_pmd_writable(vma, vmf->address, pmd)) writable = true; folio = vm_normal_folio_pmd(vma, haddr, pmd); if (!folio) goto out_map; /* See similar comment in do_numa_page for explanation */ if (!writable) flags |= TNF_NO_GROUP; nid = folio_nid(folio); /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value. */ if (node_is_toptier(nid)) last_cpupid = folio_last_cpupid(folio); target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); goto out_map; } spin_unlock(vmf->ptl); writable = false; migrated = migrate_misplaced_folio(folio, vma, target_nid); if (migrated) { flags |= TNF_MIGRATED; nid = target_nid; } else { flags |= TNF_MIGRATE_FAIL; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { spin_unlock(vmf->ptl); goto out; } goto out_map; } out: if (nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); return 0; out_map: /* Restore the PMD */ pmd = pmd_modify(oldpmd, vma->vm_page_prot); pmd = pmd_mkyoung(pmd); if (writable) pmd = pmd_mkwrite(pmd, vma); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); goto out; } /* * Return true if we do MADV_FREE successfully on entire pmd page. * Otherwise, return false. */ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next) { spinlock_t *ptl; pmd_t orig_pmd; struct folio *folio; struct mm_struct *mm = tlb->mm; bool ret = false; tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) goto out_unlocked; orig_pmd = *pmd; if (is_huge_zero_pmd(orig_pmd)) goto out; if (unlikely(!pmd_present(orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(orig_pmd)); goto out; } folio = pfn_folio(pmd_pfn(orig_pmd)); /* * If other processes are mapping this folio, we couldn't discard * the folio unless they all do MADV_FREE so let's skip the folio. */ if (folio_estimated_sharers(folio) != 1) goto out; if (!folio_trylock(folio)) goto out; /* * If user want to discard part-pages of THP, split it so MADV_FREE * will deactivate only them. */ if (next - addr != HPAGE_PMD_SIZE) { folio_get(folio); spin_unlock(ptl); split_folio(folio); folio_unlock(folio); folio_put(folio); goto out_unlocked; } if (folio_test_dirty(folio)) folio_clear_dirty(folio); folio_unlock(folio); if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); orig_pmd = pmd_mkclean(orig_pmd); set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } folio_mark_lazyfree(folio); ret = true; out: spin_unlock(ptl); out_unlocked: return ret; } static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, pmd); pte_free(mm, pgtable); mm_dec_nr_ptes(mm); } int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { pmd_t orig_pmd; spinlock_t *ptl; tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; /* * For architectures like ppc64 we look at deposited pgtable * when calling pmdp_huge_get_and_clear. So do the * pgtable_trans_huge_withdraw after finishing pmdp related * operations. */ orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, tlb->fullmm); arch_check_zapped_pmd(vma, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); } else if (is_huge_zero_pmd(orig_pmd)) { zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); } else { struct page *page = NULL; int flush_needed = 1; if (pmd_present(orig_pmd)) { page = pmd_page(orig_pmd); page_remove_rmap(page, vma, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { swp_entry_t entry; VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); page = pfn_swap_entry_to_page(entry); flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); if (PageAnon(page)) { zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR); } spin_unlock(ptl); if (flush_needed) tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); } return 1; } #ifndef pmd_move_must_withdraw static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, spinlock_t *old_pmd_ptl, struct vm_area_struct *vma) { /* * With split pmd lock we also need to move preallocated * PTE page table if new_pmd is on different PMD page table. * * We also don't deposit and withdraw tables for file pages. */ return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); } #endif static pmd_t move_soft_dirty_pmd(pmd_t pmd) { #ifdef CONFIG_MEM_SOFT_DIRTY if (unlikely(is_pmd_migration_entry(pmd))) pmd = pmd_swp_mksoft_dirty(pmd); else if (pmd_present(pmd)) pmd = pmd_mksoft_dirty(pmd); #endif return pmd; } bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; struct mm_struct *mm = vma->vm_mm; bool force_flush = false; /* * The destination pmd shouldn't be established, free_pgtables() * should have released it; but move_page_tables() might have already * inserted a page table, if racing against shmem/file collapse. */ if (!pmd_none(*new_pmd)) { VM_BUG_ON(pmd_trans_huge(*new_pmd)); return false; } /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. */ old_ptl = __pmd_trans_huge_lock(old_pmd, vma); if (old_ptl) { new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); if (pmd_present(pmd)) force_flush = true; VM_BUG_ON(!pmd_none(*new_pmd)); if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); } pmd = move_soft_dirty_pmd(pmd); set_pmd_at(mm, new_addr, new_pmd, pmd); if (force_flush) flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); return true; } return false; } /* * Returns * - 0 if PMD could not be locked * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary * or if prot_numa but THP migration is not supported * - HPAGE_PMD_NR if protections changed and TLB flush necessary */ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; pmd_t oldpmd, entry; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; int ret = 1; tlb_change_page_size(tlb, HPAGE_PMD_SIZE); if (prot_numa && !thp_migration_supported()) return 1; ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); pmd_t newpmd; VM_BUG_ON(!is_pmd_migration_entry(*pmd)); if (is_writable_migration_entry(entry)) { /* * A protection check is difficult so * just be safe and disable write */ if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry(swp_offset(entry)); else entry = make_readable_migration_entry(swp_offset(entry)); newpmd = swp_entry_to_pmd(entry); if (pmd_swp_soft_dirty(*pmd)) newpmd = pmd_swp_mksoft_dirty(newpmd); } else { newpmd = *pmd; } if (uffd_wp) newpmd = pmd_swp_mkuffd_wp(newpmd); else if (uffd_wp_resolve) newpmd = pmd_swp_clear_uffd_wp(newpmd); if (!pmd_same(*pmd, newpmd)) set_pmd_at(mm, addr, pmd, newpmd); goto unlock; } #endif if (prot_numa) { struct folio *folio; bool toptier; /* * Avoid trapping faults against the zero page. The read-only * data is likely to be read-cached on the local CPU and * local/remote hits to the zero page are not interesting. */ if (is_huge_zero_pmd(*pmd)) goto unlock; if (pmd_protnone(*pmd)) goto unlock; folio = page_folio(pmd_page(*pmd)); toptier = node_is_toptier(folio_nid(folio)); /* * Skip scanning top tier node if normal numa * balancing is disabled */ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier) goto unlock; if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && !toptier) folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); } /* * In case prot_numa, we are under mmap_read_lock(mm). It's critical * to not clear pmd intermittently to avoid race with MADV_DONTNEED * which is also under mmap_read_lock(mm): * * CPU0: CPU1: * change_huge_pmd(prot_numa=1) * pmdp_huge_get_and_clear_notify() * madvise_dontneed() * zap_pmd_range() * pmd_trans_huge(*pmd) == 0 (without ptl) * // skip the pmd * set_pmd_at(); * // pmd is re-established * * The race makes MADV_DONTNEED miss the huge pmd and don't clear it * which may break userspace. * * pmdp_invalidate_ad() is required to make sure we don't miss * dirty/young flags set by hardware. */ oldpmd = pmdp_invalidate_ad(vma, addr, pmd); entry = pmd_modify(oldpmd, newprot); if (uffd_wp) entry = pmd_mkuffd_wp(entry); else if (uffd_wp_resolve) /* * Leave the write bit to be handled by PF interrupt * handler, then things like COW could be properly * handled. */ entry = pmd_clear_uffd_wp(entry); /* See change_pte_range(). */ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && can_change_pmd_writable(vma, addr, entry)) entry = pmd_mkwrite(entry, vma); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); if (huge_pmd_needs_flush(oldpmd, entry)) tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); unlock: spin_unlock(ptl); return ret; } /* * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. * * Note that if it returns page table lock pointer, this routine returns without * unlocking page table lock. So callers must unlock it. */ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; ptl = pmd_lock(vma->vm_mm, pmd); if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) return ptl; spin_unlock(ptl); return NULL; } /* * Returns page table lock pointer if a given pud maps a thp, NULL otherwise. * * Note that if it returns page table lock pointer, this routine returns without * unlocking page table lock. So callers must unlock it. */ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { spinlock_t *ptl; ptl = pud_lock(vma->vm_mm, pud); if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) return ptl; spin_unlock(ptl); return NULL; } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr) { spinlock_t *ptl; ptl = __pud_trans_huge_lock(pud, vma); if (!ptl) return 0; pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); tlb_remove_pud_tlb_entry(tlb, pud, addr); if (vma_is_special_huge(vma)) { spin_unlock(ptl); /* No zero page support yet */ } else { /* No support for anonymous PUD pages yet */ BUG(); } return 1; } static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, unsigned long haddr) { VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); count_vm_event(THP_SPLIT_PUD); pudp_huge_clear_flush(vma, haddr, pud); } void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address) { spinlock_t *ptl; struct mmu_notifier_range range; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pud_lock(vma->vm_mm, pud); if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) goto out; __split_huge_pud_locked(vma, pud, range.start); out: spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd) { struct mm_struct *mm = vma->vm_mm; pgtable_t pgtable; pmd_t _pmd, old_pmd; unsigned long addr; pte_t *pte; int i; /* * Leave pmd empty until pte is filled note that it is fine to delay * notification until mmu_notifier_invalidate_range_end() as we are * replacing a zero pmd write protected page with a zero pte write * protected page. * * See Documentation/mm/mmu_notifier.rst */ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte); for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { pte_t entry; entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot); entry = pte_mkspecial(entry); if (pmd_uffd_wp(old_pmd)) entry = pte_mkuffd_wp(entry); VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); pte++; } pte_unmap(pte - 1); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); } static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, unsigned long haddr, bool freeze) { struct mm_struct *mm = vma->vm_mm; struct page *page; pgtable_t pgtable; pmd_t old_pmd, _pmd; bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; bool anon_exclusive = false, dirty = false; unsigned long addr; pte_t *pte; int i; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); count_vm_event(THP_SPLIT_PMD); if (!vma_is_anonymous(vma)) { old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); /* * We are going to unmap this huge page. So * just go ahead and zap it */ if (arch_needs_pgtable_deposit()) zap_deposited_table(mm, pmd); if (vma_is_special_huge(vma)) return; if (unlikely(is_pmd_migration_entry(old_pmd))) { swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); page = pfn_swap_entry_to_page(entry); } else { page = pmd_page(old_pmd); if (!PageDirty(page) && pmd_dirty(old_pmd)) set_page_dirty(page); if (!PageReferenced(page) && pmd_young(old_pmd)) SetPageReferenced(page); page_remove_rmap(page, vma, true); put_page(page); } add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); return; } if (is_huge_zero_pmd(*pmd)) { /* * FIXME: Do we want to invalidate secondary mmu by calling * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below * inside __split_huge_pmd() ? * * We are going from a zero huge page write protected to zero * small page also write protected so it does not seems useful * to invalidate secondary mmu at this time. */ return __split_huge_zero_page_pmd(vma, haddr, pmd); } /* * Up to this point the pmd is present and huge and userland has the * whole access to the hugepage during the split (which happens in * place). If we overwrite the pmd with the not-huge version pointing * to the pte here (which of course we could if all CPUs were bug * free), userland could trigger a small page size TLB miss on the * small sized TLB while the hugepage TLB entry is still established in * the huge TLB. Some CPU doesn't like that. * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum * 383 on page 105. Intel should be safe but is also warns that it's * only safe if the permission and cache attributes of the two entries * loaded in the two TLB is identical (which should be the case here). * But it is generally safer to never allow small and huge TLB entries * for the same virtual address to be loaded simultaneously. So instead * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the * current pmd notpresent (atomically because here the pmd_trans_huge * must remain set at all times on the pmd until the split is complete * for this pmd), then we flush the SMP TLB and finally we write the * non-huge version of the pmd entry with pmd_populate. */ old_pmd = pmdp_invalidate(vma, haddr, pmd); pmd_migration = is_pmd_migration_entry(old_pmd); if (unlikely(pmd_migration)) { swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); page = pfn_swap_entry_to_page(entry); write = is_writable_migration_entry(entry); if (PageAnon(page)) anon_exclusive = is_readable_exclusive_migration_entry(entry); young = is_migration_entry_young(entry); dirty = is_migration_entry_dirty(entry); soft_dirty = pmd_swp_soft_dirty(old_pmd); uffd_wp = pmd_swp_uffd_wp(old_pmd); } else { page = pmd_page(old_pmd); if (pmd_dirty(old_pmd)) { dirty = true; SetPageDirty(page); } write = pmd_write(old_pmd); young = pmd_young(old_pmd); soft_dirty = pmd_soft_dirty(old_pmd); uffd_wp = pmd_uffd_wp(old_pmd); VM_BUG_ON_PAGE(!page_count(page), page); /* * Without "freeze", we'll simply split the PMD, propagating the * PageAnonExclusive() flag for each PTE by setting it for * each subpage -- no need to (temporarily) clear. * * With "freeze" we want to replace mapped pages by * migration entries right away. This is only possible if we * managed to clear PageAnonExclusive() -- see * set_pmd_migration_entry(). * * In case we cannot clear PageAnonExclusive(), split the PMD * only and let try_to_migrate_one() fail later. * * See page_try_share_anon_rmap(): invalidate PMD first. */ anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (freeze && anon_exclusive && page_try_share_anon_rmap(page)) freeze = false; if (!freeze) page_ref_add(page, HPAGE_PMD_NR - 1); } /* * Withdraw the table only after we mark the pmd entry invalid. * This's critical for some architectures (Power). */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte); for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { pte_t entry; /* * Note that NUMA hinting access restrictions are not * transferred to avoid any possibility of altering * permissions across VMAs. */ if (freeze || pmd_migration) { swp_entry_t swp_entry; if (write) swp_entry = make_writable_migration_entry( page_to_pfn(page + i)); else if (anon_exclusive) swp_entry = make_readable_exclusive_migration_entry( page_to_pfn(page + i)); else swp_entry = make_readable_migration_entry( page_to_pfn(page + i)); if (young) swp_entry = make_migration_entry_young(swp_entry); if (dirty) swp_entry = make_migration_entry_dirty(swp_entry); entry = swp_entry_to_pte(swp_entry); if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); if (uffd_wp) entry = pte_swp_mkuffd_wp(entry); } else { entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); if (write) entry = pte_mkwrite(entry, vma); if (anon_exclusive) SetPageAnonExclusive(page + i); if (!young) entry = pte_mkold(entry); /* NOTE: this may set soft-dirty too on some archs */ if (dirty) entry = pte_mkdirty(entry); if (soft_dirty) entry = pte_mksoft_dirty(entry); if (uffd_wp) entry = pte_mkuffd_wp(entry); page_add_anon_rmap(page + i, vma, addr, RMAP_NONE); } VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); pte++; } pte_unmap(pte - 1); if (!pmd_migration) page_remove_rmap(page, vma, true); if (freeze) put_page(page); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio) { spinlock_t *ptl; struct mmu_notifier_range range; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pmd_lock(vma->vm_mm, pmd); /* * If caller asks to setup a migration entry, we need a folio to check * pmd against. Otherwise we can end up replacing wrong folio. */ VM_BUG_ON(freeze && !folio); VM_WARN_ON_ONCE(folio && !folio_test_locked(folio)); if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)) { /* * It's safe to call pmd_page when folio is set because it's * guaranteed that pmd is present. */ if (folio && folio != page_folio(pmd_page(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, range.start, freeze); } out: spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio) { pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); if (!pmd) return; __split_huge_pmd(vma, pmd, address, freeze, folio); } static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) { /* * If the new address isn't hpage aligned and it could previously * contain an hugepage: check if we need to split an huge pmd. */ if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), ALIGN(address, HPAGE_PMD_SIZE))) split_huge_pmd_address(vma, address, false, NULL); } void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) { /* Check if we need to split start first. */ split_huge_pmd_if_needed(vma, start); /* Check if we need to split end next. */ split_huge_pmd_if_needed(vma, end); /* * If we're also updating the next vma vm_start, * check if we need to split it. */ if (adjust_next > 0) { struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); unsigned long nstart = next->vm_start; nstart += adjust_next; split_huge_pmd_if_needed(next, nstart); } } static void unmap_folio(struct folio *folio) { enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC; VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); /* * Anon pages need migration entries to preserve them, but file * pages can simply be left unmapped, then faulted back on demand. * If that is ever changed (perhaps for mlock), update remap_page(). */ if (folio_test_anon(folio)) try_to_migrate(folio, ttu_flags); else try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); } static void remap_page(struct folio *folio, unsigned long nr) { int i = 0; /* If unmap_folio() uses try_to_migrate() on file, remove this check */ if (!folio_test_anon(folio)) return; for (;;) { remove_migration_ptes(folio, folio, true); i += folio_nr_pages(folio); if (i >= nr) break; folio = folio_next(folio); } } static void lru_add_page_tail(struct page *head, struct page *tail, struct lruvec *lruvec, struct list_head *list) { VM_BUG_ON_PAGE(!PageHead(head), head); VM_BUG_ON_PAGE(PageCompound(tail), head); VM_BUG_ON_PAGE(PageLRU(tail), head); lockdep_assert_held(&lruvec->lru_lock); if (list) { /* page reclaim is reclaiming a huge page */ VM_WARN_ON(PageLRU(head)); get_page(tail); list_add_tail(&tail->lru, list); } else { /* head is still on lru (and we have it frozen) */ VM_WARN_ON(!PageLRU(head)); if (PageUnevictable(tail)) tail->mlock_count = 0; else list_add_tail(&tail->lru, &head->lru); SetPageLRU(tail); } } static void __split_huge_page_tail(struct folio *folio, int tail, struct lruvec *lruvec, struct list_head *list) { struct page *head = &folio->page; struct page *page_tail = head + tail; /* * Careful: new_folio is not a "real" folio before we cleared PageTail. * Don't pass it around before clear_compound_head(). */ struct folio *new_folio = (struct folio *)page_tail; VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); /* * Clone page flags before unfreezing refcount. * * After successful get_page_unless_zero() might follow flags change, * for example lock_page() which set PG_waiters. * * Note that for mapped sub-pages of an anonymous THP, * PG_anon_exclusive has been cleared in unmap_folio() and is stored in * the migration entry instead from where remap_page() will restore it. * We can still have PG_anon_exclusive set on effectively unmapped and * unreferenced sub-pages of an anonymous THP: we can simply drop * PG_anon_exclusive (-> PG_mappedtodisk) for these here. */ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | (1L << PG_swapcache) | (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | #ifdef CONFIG_ARCH_USES_PG_ARCH_X (1L << PG_arch_2) | (1L << PG_arch_3) | #endif (1L << PG_dirty) | LRU_GEN_MASK | LRU_REFS_MASK)); /* ->mapping in first and second tail page is replaced by other uses */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, page_tail); page_tail->mapping = head->mapping; page_tail->index = head->index + tail; /* * page->private should not be set in tail pages. Fix up and warn once * if private is unexpectedly set. */ if (unlikely(page_tail->private)) { VM_WARN_ON_ONCE_PAGE(true, page_tail); page_tail->private = 0; } if (folio_test_swapcache(folio)) new_folio->swap.val = folio->swap.val + tail; /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); /* * Clear PageTail before unfreezing page refcount. * * After successful get_page_unless_zero() might follow put_page() * which needs correct compound_head(). */ clear_compound_head(page_tail); /* Finally unfreeze refcount. Additional reference from page cache. */ page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || PageSwapCache(head))); if (page_is_young(head)) set_page_young(page_tail); if (page_is_idle(head)) set_page_idle(page_tail); folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); /* * always add to the tail because some iterators expect new * pages to show after the currently processed elements - e.g. * migrate_pages */ lru_add_page_tail(head, page_tail, lruvec, list); } static void __split_huge_page(struct page *page, struct list_head *list, pgoff_t end) { struct folio *folio = page_folio(page); struct page *head = &folio->page; struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; unsigned int nr = thp_nr_pages(head); int i, nr_dropped = 0; /* complete memcg works before add pages to LRU */ split_page_memcg(head, nr); if (folio_test_anon(folio) && folio_test_swapcache(folio)) { offset = swp_offset(folio->swap); swap_cache = swap_address_space(folio->swap); xa_lock(&swap_cache->i_pages); } /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ lruvec = folio_lruvec_lock(folio); ClearPageHasHWPoisoned(head); for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(folio, i, lruvec, list); /* Some pages can be beyond EOF: drop them from page cache */ if (head[i].index >= end) { struct folio *tail = page_folio(head + i); if (shmem_mapping(head->mapping)) nr_dropped++; else if (folio_test_clear_dirty(tail)) folio_account_cleaned(tail, inode_to_wb(folio->mapping->host)); __filemap_remove_folio(tail, NULL); folio_put(tail); } else if (!PageAnon(page)) { __xa_store(&head->mapping->i_pages, head[i].index, head + i, 0); } else if (swap_cache) { __xa_store(&swap_cache->i_pages, offset + i, head + i, 0); } } ClearPageCompound(head); unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ split_page_owner(head, nr); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { /* Additional pin to swap cache */ if (PageSwapCache(head)) { page_ref_add(head, 2); xa_unlock(&swap_cache->i_pages); } else { page_ref_inc(head); } } else { /* Additional pin to page cache */ page_ref_add(head, 2); xa_unlock(&head->mapping->i_pages); } local_irq_enable(); if (nr_dropped) shmem_uncharge(head->mapping->host, nr_dropped); remap_page(folio, nr); if (folio_test_swapcache(folio)) split_swap_cluster(folio->swap); for (i = 0; i < nr; i++) { struct page *subpage = head + i; if (subpage == page) continue; unlock_page(subpage); /* * Subpages may be freed if there wasn't any mapping * like if add_to_swap() is running on a lru page that * had its mapping zapped. And freeing these pages * requires taking the lru_lock so we do the put_page * of the tail pages after the split is complete. */ free_page_and_swap_cache(subpage); } } /* Racy check whether the huge page can be split */ bool can_split_folio(struct folio *folio, int *pextra_pins) { int extra_pins; /* Additional pins from page cache */ if (folio_test_anon(folio)) extra_pins = folio_test_swapcache(folio) ? folio_nr_pages(folio) : 0; else extra_pins = folio_nr_pages(folio); if (pextra_pins) *pextra_pins = extra_pins; return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1; } /* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. * * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. * The huge page must be locked. * * If @list is null, tail pages will be added to LRU list, otherwise, to @list. * * Both head page and tail pages will inherit mapping, flags, and so on from * the hugepage. * * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if * they are not mapped. * * Returns 0 if the hugepage is split successfully. * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under * us. */ int split_huge_page_to_list(struct page *page, struct list_head *list) { struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(folio); XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; pgoff_t end; bool is_hzp; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); is_hzp = is_huge_zero_page(&folio->page); if (is_hzp) { pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); return -EBUSY; } if (folio_test_writeback(folio)) return -EBUSY; if (folio_test_anon(folio)) { /* * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a * reference to it and then lock the anon_vma for write. This * is similar to folio_lock_anon_vma_read except the write lock * is taken to serialise against parallel split or collapse * operations. */ anon_vma = folio_get_anon_vma(folio); if (!anon_vma) { ret = -EBUSY; goto out; } end = -1; mapping = NULL; anon_vma_lock_write(anon_vma); } else { gfp_t gfp; mapping = folio->mapping; /* Truncated ? */ if (!mapping) { ret = -EBUSY; goto out; } gfp = current_gfp_context(mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); if (!filemap_release_folio(folio, gfp)) { ret = -EBUSY; goto out; } xas_split_alloc(&xas, folio, folio_order(folio), gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; } anon_vma = NULL; i_mmap_lock_read(mapping); /* *__split_huge_page() may need to trim off pages beyond EOF: * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, * which cannot be nested inside the page tree lock. So note * end now: i_size itself may be changed at any moment, but * folio lock is good enough to serialize the trimming. */ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (shmem_mapping(mapping)) end = shmem_fallocend(mapping->host, end); } /* * Racy check if we can split the page, before unmap_folio() will * split PMDs */ if (!can_split_folio(folio, &extra_pins)) { ret = -EAGAIN; goto out_unlock; } unmap_folio(folio); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { /* * Check if the folio is present in page cache. * We assume all tail are present too, if folio is there. */ xas_lock(&xas); xas_reset(&xas); if (xas_load(&xas) != folio) goto fail; } /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; list_del(&folio->_deferred_list); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { int nr = folio_nr_pages(folio); xas_split(&xas, folio, folio_order(folio)); if (folio_test_pmd_mappable(folio)) { if (folio_test_swapbacked(folio)) { __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else { __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } } } __split_huge_page(page, list, end); ret = 0; } else { spin_unlock(&ds_queue->split_queue_lock); fail: if (mapping) xas_unlock(&xas); local_irq_enable(); remap_page(folio, folio_nr_pages(folio)); ret = -EAGAIN; } out_unlock: if (anon_vma) { anon_vma_unlock_write(anon_vma); put_anon_vma(anon_vma); } if (mapping) i_mmap_unlock_read(mapping); out: xas_destroy(&xas); count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); return ret; } void folio_undo_large_rmappable(struct folio *folio) { struct deferred_split *ds_queue; unsigned long flags; /* * At this point, there is no one trying to add the folio to * deferred_list. If folio is not in deferred_list, it's safe * to check without acquiring the split_queue_lock. */ if (data_race(list_empty(&folio->_deferred_list))) return; ds_queue = get_deferred_split_queue(folio); spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; list_del(&folio->_deferred_list); } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } void deferred_split_folio(struct folio *folio) { struct deferred_split *ds_queue = get_deferred_split_queue(folio); #ifdef CONFIG_MEMCG struct mem_cgroup *memcg = folio_memcg(folio); #endif unsigned long flags; VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); /* * The try_to_unmap() in page reclaim path might reach here too, * this may cause a race condition to corrupt deferred split queue. * And, if page reclaim is already handling the same folio, it is * unnecessary to handle it again in shrinker. * * Check the swapcache flag to determine if the folio is being * handled by page reclaim since THP swap would add the folio into * swap cache before calling try_to_unmap(). */ if (folio_test_swapcache(folio)) return; if (!list_empty(&folio->_deferred_list)) return; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (list_empty(&folio->_deferred_list)) { count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG if (memcg) set_shrinker_bit(memcg, folio_nid(folio), deferred_split_shrinker->id); #endif } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); struct deferred_split *ds_queue = &pgdata->deferred_split_queue; #ifdef CONFIG_MEMCG if (sc->memcg) ds_queue = &sc->memcg->deferred_split_queue; #endif return READ_ONCE(ds_queue->split_queue_len); } static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; LIST_HEAD(list); struct folio *folio, *next; int split = 0; #ifdef CONFIG_MEMCG if (sc->memcg) ds_queue = &sc->memcg->deferred_split_queue; #endif spin_lock_irqsave(&ds_queue->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_entry_safe(folio, next, &ds_queue->split_queue, _deferred_list) { if (folio_try_get(folio)) { list_move(&folio->_deferred_list, &list); } else { /* We lost race with folio_put() */ list_del_init(&folio->_deferred_list); ds_queue->split_queue_len--; } if (!--sc->nr_to_scan) break; } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); list_for_each_entry_safe(folio, next, &list, _deferred_list) { if (!folio_trylock(folio)) goto next; /* split_huge_page() removes page from list on success */ if (!split_folio(folio)) split++; folio_unlock(folio); next: folio_put(folio); } spin_lock_irqsave(&ds_queue->split_queue_lock, flags); list_splice_tail(&list, &ds_queue->split_queue); spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); /* * Stop shrinker if we didn't split any page, but the queue is empty. * This can happen if pages were freed under us. */ if (!split && list_empty(&ds_queue->split_queue)) return SHRINK_STOP; return split; } #ifdef CONFIG_DEBUG_FS static void split_huge_pages_all(void) { struct zone *zone; struct page *page; struct folio *folio; unsigned long pfn, max_zone_pfn; unsigned long total = 0, split = 0; pr_debug("Split all THPs\n"); for_each_zone(zone) { if (!managed_zone(zone)) continue; max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { int nr_pages; page = pfn_to_online_page(pfn); if (!page || PageTail(page)) continue; folio = page_folio(page); if (!folio_try_get(folio)) continue; if (unlikely(page_folio(page) != folio)) goto next; if (zone != folio_zone(folio)) goto next; if (!folio_test_large(folio) || folio_test_hugetlb(folio) || !folio_test_lru(folio)) goto next; total++; folio_lock(folio); nr_pages = folio_nr_pages(folio); if (!split_folio(folio)) split++; pfn += nr_pages - 1; folio_unlock(folio); next: folio_put(folio); cond_resched(); } } pr_debug("%lu of %lu THP split\n", split, total); } static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) { return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || is_vm_hugetlb_page(vma); } static int split_huge_pages_pid(int pid, unsigned long vaddr_start, unsigned long vaddr_end) { int ret = 0; struct task_struct *task; struct mm_struct *mm; unsigned long total = 0, split = 0; unsigned long addr; vaddr_start &= PAGE_MASK; vaddr_end &= PAGE_MASK; /* Find the task_struct from pid */ rcu_read_lock(); task = find_task_by_vpid(pid); if (!task) { rcu_read_unlock(); ret = -ESRCH; goto out; } get_task_struct(task); rcu_read_unlock(); /* Find the mm_struct */ mm = get_task_mm(task); put_task_struct(task); if (!mm) { ret = -EINVAL; goto out; } pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", pid, vaddr_start, vaddr_end); mmap_read_lock(mm); /* * always increase addr by PAGE_SIZE, since we could have a PTE page * table filled with PTE-mapped THPs, each of which is distinct. */ for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { struct vm_area_struct *vma = vma_lookup(mm, addr); struct page *page; struct folio *folio; if (!vma) break; /* skip special VMA and hugetlb VMA */ if (vma_not_suitable_for_thp_split(vma)) { addr = vma->vm_end; continue; } /* FOLL_DUMP to ignore special (like zero) pages */ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); if (IS_ERR_OR_NULL(page)) continue; folio = page_folio(page); if (!is_transparent_hugepage(folio)) goto next; total++; if (!can_split_folio(folio, NULL)) goto next; if (!folio_trylock(folio)) goto next; if (!split_folio(folio)) split++; folio_unlock(folio); next: folio_put(folio); cond_resched(); } mmap_read_unlock(mm); mmput(mm); pr_debug("%lu of %lu THP split\n", split, total); out: return ret; } static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, pgoff_t off_end) { struct filename *file; struct file *candidate; struct address_space *mapping; int ret = -EINVAL; pgoff_t index; int nr_pages = 1; unsigned long total = 0, split = 0; file = getname_kernel(file_path); if (IS_ERR(file)) return ret; candidate = file_open_name(file, O_RDONLY, 0); if (IS_ERR(candidate)) goto out; pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", file_path, off_start, off_end); mapping = candidate->f_mapping; for (index = off_start; index < off_end; index += nr_pages) { struct folio *folio = filemap_get_folio(mapping, index); nr_pages = 1; if (IS_ERR(folio)) continue; if (!folio_test_large(folio)) goto next; total++; nr_pages = folio_nr_pages(folio); if (!folio_trylock(folio)) goto next; if (!split_folio(folio)) split++; folio_unlock(folio); next: folio_put(folio); cond_resched(); } filp_close(candidate, NULL); ret = 0; pr_debug("%lu of %lu file-backed THP split\n", split, total); out: putname(file); return ret; } #define MAX_INPUT_BUF_SZ 255 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, size_t count, loff_t *ppops) { static DEFINE_MUTEX(split_debug_mutex); ssize_t ret; /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */ char input_buf[MAX_INPUT_BUF_SZ]; int pid; unsigned long vaddr_start, vaddr_end; ret = mutex_lock_interruptible(&split_debug_mutex); if (ret) return ret; ret = -EFAULT; memset(input_buf, 0, MAX_INPUT_BUF_SZ); if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) goto out; input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; if (input_buf[0] == '/') { char *tok; char *buf = input_buf; char file_path[MAX_INPUT_BUF_SZ]; pgoff_t off_start = 0, off_end = 0; size_t input_len = strlen(input_buf); tok = strsep(&buf, ","); if (tok) { strcpy(file_path, tok); } else { ret = -EINVAL; goto out; } ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end); if (ret != 2) { ret = -EINVAL; goto out; } ret = split_huge_pages_in_file(file_path, off_start, off_end); if (!ret) ret = input_len; goto out; } ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); if (ret == 1 && pid == 1) { split_huge_pages_all(); ret = strlen(input_buf); goto out; } else if (ret != 3) { ret = -EINVAL; goto out; } ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end); if (!ret) ret = strlen(input_buf); out: mutex_unlock(&split_debug_mutex); return ret; } static const struct file_operations split_huge_pages_fops = { .owner = THIS_MODULE, .write = split_huge_pages_write, .llseek = no_llseek, }; static int __init split_huge_pages_debugfs(void) { debugfs_create_file("split_huge_pages", 0200, NULL, NULL, &split_huge_pages_fops); return 0; } late_initcall(split_huge_pages_debugfs); #endif #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) { struct vm_area_struct *vma = pvmw->vma; struct mm_struct *mm = vma->vm_mm; unsigned long address = pvmw->address; bool anon_exclusive; pmd_t pmdval; swp_entry_t entry; pmd_t pmdswp; if (!(pvmw->pmd && !pvmw->pte)) return 0; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); pmdval = pmdp_invalidate(vma, address, pvmw->pmd); /* See page_try_share_anon_rmap(): invalidate PMD first. */ anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive && page_try_share_anon_rmap(page)) { set_pmd_at(mm, address, pvmw->pmd, pmdval); return -EBUSY; } if (pmd_dirty(pmdval)) set_page_dirty(page); if (pmd_write(pmdval)) entry = make_writable_migration_entry(page_to_pfn(page)); else if (anon_exclusive) entry = make_readable_exclusive_migration_entry(page_to_pfn(page)); else entry = make_readable_migration_entry(page_to_pfn(page)); if (pmd_young(pmdval)) entry = make_migration_entry_young(entry); if (pmd_dirty(pmdval)) entry = make_migration_entry_dirty(entry); pmdswp = swp_entry_to_pmd(entry); if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); if (pmd_uffd_wp(pmdval)) pmdswp = pmd_swp_mkuffd_wp(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, vma, true); put_page(page); trace_set_migration_pmd(address, pmd_val(pmdswp)); return 0; } void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { struct vm_area_struct *vma = pvmw->vma; struct mm_struct *mm = vma->vm_mm; unsigned long address = pvmw->address; unsigned long haddr = address & HPAGE_PMD_MASK; pmd_t pmde; swp_entry_t entry; if (!(pvmw->pmd && !pvmw->pte)) return; entry = pmd_to_swp_entry(*pvmw->pmd); get_page(new); pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (is_writable_migration_entry(entry)) pmde = pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_mkuffd_wp(pmde); if (!is_migration_entry_young(entry)) pmde = pmd_mkold(pmde); /* NOTE: this may contain setting soft-dirty on some archs */ if (PageDirty(new) && is_migration_entry_dirty(entry)) pmde = pmd_mkdirty(pmde); if (PageAnon(new)) { rmap_t rmap_flags = RMAP_COMPOUND; if (!is_readable_migration_entry(entry)) rmap_flags |= RMAP_EXCLUSIVE; page_add_anon_rmap(new, vma, haddr, rmap_flags); } else { page_add_file_rmap(new, vma, true); } VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new)); set_pmd_at(mm, haddr, pvmw->pmd, pmde); /* No need to invalidate - it was non-present before */ update_mmu_cache_pmd(vma, address, pvmw->pmd); trace_remove_migration_pmd(address, pmd_val(pmde)); } #endif
142 142 141 142 142 1158 142 142 1158 997 997 997 1057 1057 997 997 997 996 997 997 997 1359 47 509 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 IBM Corporation * * Authors: * Mimi Zohar <zohar@us.ibm.com> * * File: integrity_iint.c * - implements the integrity hooks: integrity_inode_alloc, * integrity_inode_free * - cache integrity information associated with an inode * using a rbtree tree. */ #include <linux/slab.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/file.h> #include <linux/uaccess.h> #include <linux/security.h> #include <linux/lsm_hooks.h> #include "integrity.h" static struct rb_root integrity_iint_tree = RB_ROOT; static DEFINE_RWLOCK(integrity_iint_lock); static struct kmem_cache *iint_cache __ro_after_init; struct dentry *integrity_dir; /* * __integrity_iint_find - return the iint associated with an inode */ static struct integrity_iint_cache *__integrity_iint_find(struct inode *inode) { struct integrity_iint_cache *iint; struct rb_node *n = integrity_iint_tree.rb_node; while (n) { iint = rb_entry(n, struct integrity_iint_cache, rb_node); if (inode < iint->inode) n = n->rb_left; else if (inode > iint->inode) n = n->rb_right; else return iint; } return NULL; } /* * integrity_iint_find - return the iint associated with an inode */ struct integrity_iint_cache *integrity_iint_find(struct inode *inode) { struct integrity_iint_cache *iint; if (!IS_IMA(inode)) return NULL; read_lock(&integrity_iint_lock); iint = __integrity_iint_find(inode); read_unlock(&integrity_iint_lock); return iint; } #define IMA_MAX_NESTING (FILESYSTEM_MAX_STACK_DEPTH+1) /* * It is not clear that IMA should be nested at all, but as long is it measures * files both on overlayfs and on underlying fs, we need to annotate the iint * mutex to avoid lockdep false positives related to IMA + overlayfs. * See ovl_lockdep_annotate_inode_mutex_key() for more details. */ static inline void iint_lockdep_annotate(struct integrity_iint_cache *iint, struct inode *inode) { #ifdef CONFIG_LOCKDEP static struct lock_class_key iint_mutex_key[IMA_MAX_NESTING]; int depth = inode->i_sb->s_stack_depth; if (WARN_ON_ONCE(depth < 0 || depth >= IMA_MAX_NESTING)) depth = 0; lockdep_set_class(&iint->mutex, &iint_mutex_key[depth]); #endif } static void iint_init_always(struct integrity_iint_cache *iint, struct inode *inode) { iint->ima_hash = NULL; iint->version = 0; iint->flags = 0UL; iint->atomic_flags = 0UL; iint->ima_file_status = INTEGRITY_UNKNOWN; iint->ima_mmap_status = INTEGRITY_UNKNOWN; iint->ima_bprm_status = INTEGRITY_UNKNOWN; iint->ima_read_status = INTEGRITY_UNKNOWN; iint->ima_creds_status = INTEGRITY_UNKNOWN; iint->evm_status = INTEGRITY_UNKNOWN; iint->measured_pcrs = 0; mutex_init(&iint->mutex); iint_lockdep_annotate(iint, inode); } static void iint_free(struct integrity_iint_cache *iint) { kfree(iint->ima_hash); mutex_destroy(&iint->mutex); kmem_cache_free(iint_cache, iint); } /** * integrity_inode_get - find or allocate an iint associated with an inode * @inode: pointer to the inode * @return: allocated iint * * Caller must lock i_mutex */ struct integrity_iint_cache *integrity_inode_get(struct inode *inode) { struct rb_node **p; struct rb_node *node, *parent = NULL; struct integrity_iint_cache *iint, *test_iint; iint = integrity_iint_find(inode); if (iint) return iint; iint = kmem_cache_alloc(iint_cache, GFP_NOFS); if (!iint) return NULL; iint_init_always(iint, inode); write_lock(&integrity_iint_lock); p = &integrity_iint_tree.rb_node; while (*p) { parent = *p; test_iint = rb_entry(parent, struct integrity_iint_cache, rb_node); if (inode < test_iint->inode) { p = &(*p)->rb_left; } else if (inode > test_iint->inode) { p = &(*p)->rb_right; } else { write_unlock(&integrity_iint_lock); kmem_cache_free(iint_cache, iint); return test_iint; } } iint->inode = inode; node = &iint->rb_node; inode->i_flags |= S_IMA; rb_link_node(node, parent, p); rb_insert_color(node, &integrity_iint_tree); write_unlock(&integrity_iint_lock); return iint; } /** * integrity_inode_free - called on security_inode_free * @inode: pointer to the inode * * Free the integrity information(iint) associated with an inode. */ void integrity_inode_free(struct inode *inode) { struct integrity_iint_cache *iint; if (!IS_IMA(inode)) return; write_lock(&integrity_iint_lock); iint = __integrity_iint_find(inode); rb_erase(&iint->rb_node, &integrity_iint_tree); write_unlock(&integrity_iint_lock); iint_free(iint); } static void iint_init_once(void *foo) { struct integrity_iint_cache *iint = (struct integrity_iint_cache *) foo; memset(iint, 0, sizeof(*iint)); } static int __init integrity_iintcache_init(void) { iint_cache = kmem_cache_create("iint_cache", sizeof(struct integrity_iint_cache), 0, SLAB_PANIC, iint_init_once); return 0; } DEFINE_LSM(integrity) = { .name = "integrity", .init = integrity_iintcache_init, .order = LSM_ORDER_LAST, }; /* * integrity_kernel_read - read data from the file * * This is a function for reading file content instead of kernel_read(). * It does not perform locking checks to ensure it cannot be blocked. * It does not perform security checks because it is irrelevant for IMA. * */ int integrity_kernel_read(struct file *file, loff_t offset, void *addr, unsigned long count) { return __kernel_read(file, addr, count, &offset); } /* * integrity_load_keys - load integrity keys hook * * Hooks is called from init/main.c:kernel_init_freeable() * when rootfs is ready */ void __init integrity_load_keys(void) { ima_load_x509(); if (!IS_ENABLED(CONFIG_IMA_LOAD_X509)) evm_load_x509(); } static int __init integrity_fs_init(void) { integrity_dir = securityfs_create_dir("integrity", NULL); if (IS_ERR(integrity_dir)) { int ret = PTR_ERR(integrity_dir); if (ret != -ENODEV) pr_err("Unable to create integrity sysfs dir: %d\n", ret); integrity_dir = NULL; return ret; } return 0; } late_initcall(integrity_fs_init)
2 1 2 2 6 2 6 2 6 1 5 3 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Cloudflare Ltd https://cloudflare.com */ #include <linux/skmsg.h> #include <net/sock.h> #include <net/udp.h> #include <net/inet_common.h> #include "udp_impl.h" static struct proto *udpv6_prot_saved __read_mostly; static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return udpv6_prot_saved->recvmsg(sk, msg, len, flags, addr_len); #endif return udp_prot.recvmsg(sk, msg, len, flags, addr_len); } static bool udp_sk_has_data(struct sock *sk) { return !skb_queue_empty(&udp_sk(sk)->reader_queue) || !skb_queue_empty(&sk->sk_receive_queue); } static bool psock_has_data(struct sk_psock *psock) { return !skb_queue_empty(&psock->ingress_skb) || !sk_psock_queue_empty(psock); } #define udp_msg_has_data(__sk, __psock) \ ({ udp_sk_has_data(__sk) || psock_has_data(__psock); }) static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) return 1; if (!timeo) return ret; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = udp_msg_has_data(sk, psock); if (!ret) { wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); ret = udp_msg_has_data(sk, psock); } sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_psock *psock; int copied, ret; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return sk_udp_recvmsg(sk, msg, len, flags, addr_len); if (!psock_has_data(psock)) { ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len); goto out; } msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { long timeo; int data; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); data = udp_msg_wait_data(sk, psock, timeo); if (data) { if (psock_has_data(psock)) goto msg_bytes_ready; ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len); goto out; } copied = -EAGAIN; } ret = copied; out: sk_psock_put(sk, psock); return ret; } enum { UDP_BPF_IPV4, UDP_BPF_IPV6, UDP_BPF_NUM_PROTS, }; static DEFINE_SPINLOCK(udpv6_prot_lock); static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; prot->recvmsg = udp_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; } static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) { spin_lock_bh(&udpv6_prot_lock); if (likely(ops != udpv6_prot_saved)) { udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops); smp_store_release(&udpv6_prot_saved, ops); } spin_unlock_bh(&udpv6_prot_lock); } } static int __init udp_bpf_v4_build_proto(void) { udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV4], &udp_prot); return 0; } late_initcall(udp_bpf_v4_build_proto); int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &udp_bpf_prots[family]); return 0; } EXPORT_SYMBOL_GPL(udp_bpf_update_proto);
2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 /* vcan.c - Virtual CAN interface * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ethtool.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/can.h> #include <linux/can/can-ml.h> #include <linux/can/dev.h> #include <linux/can/skb.h> #include <linux/slab.h> #include <net/rtnetlink.h> #define DRV_NAME "vcan" MODULE_DESCRIPTION("virtual CAN interface"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); /* CAN test feature: * Enable the echo on driver level for testing the CAN core echo modes. * See Documentation/networking/can.rst for details. */ static bool echo; /* echo testing. Default: 0 (Off) */ module_param(echo, bool, 0444); MODULE_PARM_DESC(echo, "Echo sent frames (for testing). Default: 0 (Off)"); static void vcan_rx(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; stats->rx_packets++; stats->rx_bytes += can_skb_get_data_len(skb); skb->pkt_type = PACKET_BROADCAST; skb->dev = dev; skb->ip_summed = CHECKSUM_UNNECESSARY; netif_rx(skb); } static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; unsigned int len; int loop; if (can_dropped_invalid_skb(dev, skb)) return NETDEV_TX_OK; len = can_skb_get_data_len(skb); stats->tx_packets++; stats->tx_bytes += len; /* set flag whether this packet has to be looped back */ loop = skb->pkt_type == PACKET_LOOPBACK; skb_tx_timestamp(skb); if (!echo) { /* no echo handling available inside this driver */ if (loop) { /* only count the packets here, because the * CAN core already did the echo for us */ stats->rx_packets++; stats->rx_bytes += len; } consume_skb(skb); return NETDEV_TX_OK; } /* perform standard echo handling for CAN network interfaces */ if (loop) { skb = can_create_echo_skb(skb); if (!skb) return NETDEV_TX_OK; /* receive with packet counting */ vcan_rx(skb, dev); } else { /* no looped packets => no counting */ consume_skb(skb); } return NETDEV_TX_OK; } static int vcan_change_mtu(struct net_device *dev, int new_mtu) { /* Do not allow changing the MTU while running */ if (dev->flags & IFF_UP) return -EBUSY; if (new_mtu != CAN_MTU && new_mtu != CANFD_MTU && !can_is_canxl_dev_mtu(new_mtu)) return -EINVAL; dev->mtu = new_mtu; return 0; } static const struct net_device_ops vcan_netdev_ops = { .ndo_start_xmit = vcan_tx, .ndo_change_mtu = vcan_change_mtu, }; static const struct ethtool_ops vcan_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static void vcan_setup(struct net_device *dev) { dev->type = ARPHRD_CAN; dev->mtu = CANFD_MTU; dev->hard_header_len = 0; dev->addr_len = 0; dev->tx_queue_len = 0; dev->flags = IFF_NOARP; can_set_ml_priv(dev, netdev_priv(dev)); /* set flags according to driver capabilities */ if (echo) dev->flags |= IFF_ECHO; dev->netdev_ops = &vcan_netdev_ops; dev->ethtool_ops = &vcan_ethtool_ops; dev->needs_free_netdev = true; } static struct rtnl_link_ops vcan_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct can_ml_priv), .setup = vcan_setup, }; static __init int vcan_init_module(void) { pr_info("Virtual CAN interface driver\n"); if (echo) pr_info("enabled echo on driver level.\n"); return rtnl_link_register(&vcan_link_ops); } static __exit void vcan_cleanup_module(void) { rtnl_link_unregister(&vcan_link_ops); } module_init(vcan_init_module); module_exit(vcan_cleanup_module);
1475 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 // SPDX-License-Identifier: GPL-2.0-or-later /* * "TEE" target extension for Xtables * Copyright © Sebastian Claßen, 2007 * Jan Engelhardt, 2007-2010 * * based on ipt_ROUTE.c from Cédric de Launois * <delaunois@info.ucl.be> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/route.h> #include <linux/netfilter/x_tables.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/route.h> #include <net/netfilter/ipv4/nf_dup_ipv4.h> #include <net/netfilter/ipv6/nf_dup_ipv6.h> #include <linux/netfilter/xt_TEE.h> struct xt_tee_priv { struct list_head list; struct xt_tee_tginfo *tginfo; int oif; }; static unsigned int tee_net_id __read_mostly; static const union nf_inet_addr tee_zero_address; struct tee_net { struct list_head priv_list; /* lock protects the priv_list */ struct mutex lock; }; static unsigned int tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; int oif = info->priv ? info->priv->oif : 0; nf_dup_ipv4(xt_net(par), skb, xt_hooknum(par), &info->gw.in, oif); return XT_CONTINUE; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; int oif = info->priv ? info->priv->oif : 0; nf_dup_ipv6(xt_net(par), skb, xt_hooknum(par), &info->gw.in6, oif); return XT_CONTINUE; } #endif static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct tee_net *tn = net_generic(net, tee_net_id); struct xt_tee_priv *priv; mutex_lock(&tn->lock); list_for_each_entry(priv, &tn->priv_list, list) { switch (event) { case NETDEV_REGISTER: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; break; case NETDEV_UNREGISTER: if (dev->ifindex == priv->oif) priv->oif = -1; break; case NETDEV_CHANGENAME: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; else if (dev->ifindex == priv->oif) priv->oif = -1; break; } } mutex_unlock(&tn->lock); return NOTIFY_DONE; } static int tee_tg_check(const struct xt_tgchk_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; struct xt_tee_priv *priv; /* 0.0.0.0 and :: not allowed */ if (memcmp(&info->gw, &tee_zero_address, sizeof(tee_zero_address)) == 0) return -EINVAL; if (info->oif[0]) { struct net_device *dev; if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv == NULL) return -ENOMEM; priv->tginfo = info; priv->oif = -1; info->priv = priv; dev = dev_get_by_name(par->net, info->oif); if (dev) { priv->oif = dev->ifindex; dev_put(dev); } mutex_lock(&tn->lock); list_add(&priv->list, &tn->priv_list); mutex_unlock(&tn->lock); } else info->priv = NULL; static_key_slow_inc(&xt_tee_enabled); return 0; } static void tee_tg_destroy(const struct xt_tgdtor_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; if (info->priv) { mutex_lock(&tn->lock); list_del(&info->priv->list); mutex_unlock(&tn->lock); kfree(info->priv); } static_key_slow_dec(&xt_tee_enabled); } static struct xt_target tee_tg_reg[] __read_mostly = { { .name = "TEE", .revision = 1, .family = NFPROTO_IPV4, .target = tee_tg4, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TEE", .revision = 1, .family = NFPROTO_IPV6, .target = tee_tg6, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #endif }; static int __net_init tee_net_init(struct net *net) { struct tee_net *tn = net_generic(net, tee_net_id); INIT_LIST_HEAD(&tn->priv_list); mutex_init(&tn->lock); return 0; } static struct pernet_operations tee_net_ops = { .init = tee_net_init, .id = &tee_net_id, .size = sizeof(struct tee_net), }; static struct notifier_block tee_netdev_notifier = { .notifier_call = tee_netdev_event, }; static int __init tee_tg_init(void) { int ret; ret = register_pernet_subsys(&tee_net_ops); if (ret < 0) return ret; ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); if (ret < 0) goto cleanup_subsys; ret = register_netdevice_notifier(&tee_netdev_notifier); if (ret < 0) goto unregister_targets; return 0; unregister_targets: xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); cleanup_subsys: unregister_pernet_subsys(&tee_net_ops); return ret; } static void __exit tee_tg_exit(void) { unregister_netdevice_notifier(&tee_netdev_notifier); xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); unregister_pernet_subsys(&tee_net_ops); } module_init(tee_tg_init); module_exit(tee_tg_exit); MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: Reroute packet copy"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_TEE"); MODULE_ALIAS("ip6t_TEE");
118 21 57 36 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_FIB_RULES_H #define __NET_FIB_RULES_H #include <linux/types.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/fib_rules.h> #include <linux/refcount.h> #include <net/flow.h> #include <net/rtnetlink.h> #include <net/fib_notifier.h> #include <linux/indirect_call_wrapper.h> struct fib_kuid_range { kuid_t start; kuid_t end; }; struct fib_rule { struct list_head list; int iifindex; int oifindex; u32 mark; u32 mark_mask; u32 flags; u32 table; u8 action; u8 l3mdev; u8 proto; u8 ip_proto; u32 target; __be64 tun_id; struct fib_rule __rcu *ctarget; struct net *fr_net; refcount_t refcnt; u32 pref; int suppress_ifgroup; int suppress_prefixlen; char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; struct fib_kuid_range uid_range; struct fib_rule_port_range sport_range; struct fib_rule_port_range dport_range; struct rcu_head rcu; }; struct fib_lookup_arg { void *lookup_ptr; const void *lookup_data; void *result; struct fib_rule *rule; u32 table; int flags; #define FIB_LOOKUP_NOREF 1 #define FIB_LOOKUP_IGNORE_LINKSTATE 2 }; struct fib_rules_ops { int family; struct list_head list; int rule_size; int addr_size; int unresolved_rules; int nr_goto_rules; unsigned int fib_rules_seq; int (*action)(struct fib_rule *, struct flowi *, int, struct fib_lookup_arg *); bool (*suppress)(struct fib_rule *, int, struct fib_lookup_arg *); int (*match)(struct fib_rule *, struct flowi *, int); int (*configure)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *, struct nlattr **, struct netlink_ext_ack *); int (*delete)(struct fib_rule *); int (*compare)(struct fib_rule *, struct fib_rule_hdr *, struct nlattr **); int (*fill)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *); size_t (*nlmsg_payload)(struct fib_rule *); /* Called after modifications to the rules set, must flush * the route cache if one exists. */ void (*flush_cache)(struct fib_rules_ops *ops); int nlgroup; struct list_head rules_list; struct module *owner; struct net *fro_net; struct rcu_head rcu; }; struct fib_rule_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib_rule *rule; }; static inline void fib_rule_get(struct fib_rule *rule) { refcount_inc(&rule->refcnt); } static inline void fib_rule_put(struct fib_rule *rule) { if (refcount_dec_and_test(&rule->refcnt)) kfree_rcu(rule, rcu); } #ifdef CONFIG_NET_L3_MASTER_DEV static inline u32 fib_rule_get_table(struct fib_rule *rule, struct fib_lookup_arg *arg) { return rule->l3mdev ? arg->table : rule->table; } #else static inline u32 fib_rule_get_table(struct fib_rule *rule, struct fib_lookup_arg *arg) { return rule->table; } #endif static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla) { if (nla[FRA_TABLE]) return nla_get_u32(nla[FRA_TABLE]); return frh->table; } static inline bool fib_rule_port_range_set(const struct fib_rule_port_range *range) { return range->start != 0 && range->end != 0; } static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a, __be16 port) { return ntohs(port) >= a->start && ntohs(port) <= a->end; } static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a) { return a->start != 0 && a->end != 0 && a->end < 0xffff && a->start <= a->end; } static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a, struct fib_rule_port_range *b) { return a->start == b->start && a->end == b->end; } static inline bool fib_rule_requires_fldissect(struct fib_rule *rule) { return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto || fib_rule_port_range_set(&rule->sport_range) || fib_rule_port_range_set(&rule->dport_range)); } struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *, struct net *); void fib_rules_unregister(struct fib_rules_ops *); int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, u32 flags); bool fib_rule_matchall(const struct fib_rule *rule); int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); unsigned int fib_rules_seq_read(struct net *net, int family); int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); INDIRECT_CALLABLE_DECLARE(int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); INDIRECT_CALLABLE_DECLARE(int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); INDIRECT_CALLABLE_DECLARE(int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib6_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib4_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg)); #endif
2086 2086 2584 2583 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/bitmap.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) */ #include <linux/buffer_head.h> #include "ext4.h" unsigned int ext4_count_free(char *bitmap, unsigned int numchars) { return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); } int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz) { __u32 hi; __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); if (!ext4_has_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); provided |= (hi << 16); } else calculated &= 0xFFFF; return provided == calculated; } void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz) { __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); if (!ext4_has_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); } int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { __u32 hi; __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; if (!ext4_has_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); provided |= (hi << 16); } else calculated &= 0xFFFF; return provided == calculated; } void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); if (!ext4_has_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); }
1442 1208 101 1139 1139 1139 1441 1345 1345 1345 1345 1344 1345 1442 1192 101 1441 1442 1441 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. * Copyright (c) 2016-2021 Christoph Hellwig. */ #include <linux/fs.h> #include <linux/iomap.h> #include "trace.h" /* * Advance to the next range we need to map. * * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully * processed - it was aborted because the extent the iomap spanned may have been * changed during the operation. In this case, the iteration behaviour is to * remap the unprocessed range of the iter, and that means we may need to remap * even when we've made no progress (i.e. iter->processed = 0). Hence the * "finished iterating" case needs to distinguish between * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we * need to remap the entire remaining range. */ static inline int iomap_iter_advance(struct iomap_iter *iter) { bool stale = iter->iomap.flags & IOMAP_F_STALE; /* handle the previous iteration (if any) */ if (iter->iomap.length) { if (iter->processed < 0) return iter->processed; if (!iter->processed && !stale) return 0; if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) return -EIO; iter->pos += iter->processed; iter->len -= iter->processed; if (!iter->len) return 0; } /* clear the state for the next iteration */ iter->processed = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); return 1; } static inline void iomap_iter_done(struct iomap_iter *iter) { WARN_ON_ONCE(iter->iomap.offset > iter->pos); WARN_ON_ONCE(iter->iomap.length == 0); WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) trace_iomap_iter_srcmap(iter->inode, &iter->srcmap); } /** * iomap_iter - iterate over a ranges in a file * @iter: iteration structue * @ops: iomap ops provided by the file system * * Iterate over filesystem-provided space mappings for the provided file range. * * This function handles cleanup of resources acquired for iteration when the * filesystem indicates there are no more space mappings, which means that this * function must be called in a loop that continues as long it returns a * positive value. If 0 or a negative value is returned, the caller must not * return to the loop body. Within a loop body, there are two ways to break out * of the loop body: leave @iter.processed unchanged, or set it to a negative * errno. */ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) { int ret; if (iter->iomap.length && ops->iomap_end) { ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter), iter->processed > 0 ? iter->processed : 0, iter->flags, &iter->iomap); if (ret < 0 && !iter->processed) return ret; } trace_iomap_iter(iter, ops, _RET_IP_); ret = iomap_iter_advance(iter); if (ret <= 0) return ret; ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags, &iter->iomap, &iter->srcmap); if (ret < 0) return ret; iomap_iter_done(iter); return 1; }
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright(c) 1999 - 2004 Intel Corporation. All rights reserved. */ #include <linux/skbuff.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <linux/ethtool.h> #include <linux/etherdevice.h> #include <linux/if_bonding.h> #include <linux/pkt_sched.h> #include <net/net_namespace.h> #include <net/bonding.h> #include <net/bond_3ad.h> #include <net/netlink.h> /* General definitions */ #define AD_SHORT_TIMEOUT 1 #define AD_LONG_TIMEOUT 0 #define AD_STANDBY 0x2 #define AD_MAX_TX_IN_SECOND 3 #define AD_COLLECTOR_MAX_DELAY 0 /* Timer definitions (43.4.4 in the 802.3ad standard) */ #define AD_FAST_PERIODIC_TIME 1 #define AD_SLOW_PERIODIC_TIME 30 #define AD_SHORT_TIMEOUT_TIME (3*AD_FAST_PERIODIC_TIME) #define AD_LONG_TIMEOUT_TIME (3*AD_SLOW_PERIODIC_TIME) #define AD_CHURN_DETECTION_TIME 60 #define AD_AGGREGATE_WAIT_TIME 2 /* Port Variables definitions used by the State Machines (43.4.7 in the * 802.3ad standard) */ #define AD_PORT_BEGIN 0x1 #define AD_PORT_LACP_ENABLED 0x2 #define AD_PORT_ACTOR_CHURN 0x4 #define AD_PORT_PARTNER_CHURN 0x8 #define AD_PORT_READY 0x10 #define AD_PORT_READY_N 0x20 #define AD_PORT_MATCHED 0x40 #define AD_PORT_STANDBY 0x80 #define AD_PORT_SELECTED 0x100 #define AD_PORT_MOVED 0x200 #define AD_PORT_CHURNED (AD_PORT_ACTOR_CHURN | AD_PORT_PARTNER_CHURN) /* Port Key definitions * key is determined according to the link speed, duplex and * user key (which is yet not supported) * -------------------------------------------------------------- * Port key | User key (10 bits) | Speed (5 bits) | Duplex| * -------------------------------------------------------------- * |15 6|5 1|0 */ #define AD_DUPLEX_KEY_MASKS 0x1 #define AD_SPEED_KEY_MASKS 0x3E #define AD_USER_KEY_MASKS 0xFFC0 enum ad_link_speed_type { AD_LINK_SPEED_1MBPS = 1, AD_LINK_SPEED_10MBPS, AD_LINK_SPEED_100MBPS, AD_LINK_SPEED_1000MBPS, AD_LINK_SPEED_2500MBPS, AD_LINK_SPEED_5000MBPS, AD_LINK_SPEED_10000MBPS, AD_LINK_SPEED_14000MBPS, AD_LINK_SPEED_20000MBPS, AD_LINK_SPEED_25000MBPS, AD_LINK_SPEED_40000MBPS, AD_LINK_SPEED_50000MBPS, AD_LINK_SPEED_56000MBPS, AD_LINK_SPEED_100000MBPS, AD_LINK_SPEED_200000MBPS, AD_LINK_SPEED_400000MBPS, AD_LINK_SPEED_800000MBPS, }; /* compare MAC addresses */ #define MAC_ADDRESS_EQUAL(A, B) \ ether_addr_equal_64bits((const u8 *)A, (const u8 *)B) static const u8 null_mac_addr[ETH_ALEN + 2] __long_aligned = { 0, 0, 0, 0, 0, 0 }; static const u16 ad_ticks_per_sec = 1000 / AD_TIMER_INTERVAL; static const int ad_delta_in_ticks = (AD_TIMER_INTERVAL * HZ) / 1000; const u8 lacpdu_mcast_addr[ETH_ALEN + 2] __long_aligned = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x02 }; /* ================= main 802.3ad protocol functions ================== */ static int ad_lacpdu_send(struct port *port); static int ad_marker_send(struct port *port, struct bond_marker *marker); static void ad_mux_machine(struct port *port, bool *update_slave_arr); static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port); static void ad_tx_machine(struct port *port); static void ad_periodic_machine(struct port *port, struct bond_params *bond_params); static void ad_port_selection_logic(struct port *port, bool *update_slave_arr); static void ad_agg_selection_logic(struct aggregator *aggregator, bool *update_slave_arr); static void ad_clear_agg(struct aggregator *aggregator); static void ad_initialize_agg(struct aggregator *aggregator); static void ad_initialize_port(struct port *port, int lacp_fast); static void ad_enable_collecting_distributing(struct port *port, bool *update_slave_arr); static void ad_disable_collecting_distributing(struct port *port, bool *update_slave_arr); static void ad_marker_info_received(struct bond_marker *marker_info, struct port *port); static void ad_marker_response_received(struct bond_marker *marker, struct port *port); static void ad_update_actor_keys(struct port *port, bool reset); /* ================= api to bonding and kernel code ================== */ /** * __get_bond_by_port - get the port's bonding struct * @port: the port we're looking at * * Return @port's bonding struct, or %NULL if it can't be found. */ static inline struct bonding *__get_bond_by_port(struct port *port) { if (port->slave == NULL) return NULL; return bond_get_bond_by_slave(port->slave); } /** * __get_first_agg - get the first aggregator in the bond * @port: the port we're looking at * * Return the aggregator of the first slave in @bond, or %NULL if it can't be * found. * The caller must hold RCU or RTNL lock. */ static inline struct aggregator *__get_first_agg(struct port *port) { struct bonding *bond = __get_bond_by_port(port); struct slave *first_slave; struct aggregator *agg; /* If there's no bond for this port, or bond has no slaves */ if (bond == NULL) return NULL; rcu_read_lock(); first_slave = bond_first_slave_rcu(bond); agg = first_slave ? &(SLAVE_AD_INFO(first_slave)->aggregator) : NULL; rcu_read_unlock(); return agg; } /** * __agg_has_partner - see if we have a partner * @agg: the agregator we're looking at * * Return nonzero if aggregator has a partner (denoted by a non-zero ether * address for the partner). Return 0 if not. */ static inline int __agg_has_partner(struct aggregator *agg) { return !is_zero_ether_addr(agg->partner_system.mac_addr_value); } /** * __disable_port - disable the port's slave * @port: the port we're looking at */ static inline void __disable_port(struct port *port) { bond_set_slave_inactive_flags(port->slave, BOND_SLAVE_NOTIFY_LATER); } /** * __enable_port - enable the port's slave, if it's up * @port: the port we're looking at */ static inline void __enable_port(struct port *port) { struct slave *slave = port->slave; if ((slave->link == BOND_LINK_UP) && bond_slave_is_up(slave)) bond_set_slave_active_flags(slave, BOND_SLAVE_NOTIFY_LATER); } /** * __port_is_enabled - check if the port's slave is in active state * @port: the port we're looking at */ static inline int __port_is_enabled(struct port *port) { return bond_is_active_slave(port->slave); } /** * __get_agg_selection_mode - get the aggregator selection mode * @port: the port we're looking at * * Get the aggregator selection mode. Can be %STABLE, %BANDWIDTH or %COUNT. */ static inline u32 __get_agg_selection_mode(struct port *port) { struct bonding *bond = __get_bond_by_port(port); if (bond == NULL) return BOND_AD_STABLE; return bond->params.ad_select; } /** * __check_agg_selection_timer - check if the selection timer has expired * @port: the port we're looking at */ static inline int __check_agg_selection_timer(struct port *port) { struct bonding *bond = __get_bond_by_port(port); if (bond == NULL) return 0; return atomic_read(&BOND_AD_INFO(bond).agg_select_timer) ? 1 : 0; } /** * __get_link_speed - get a port's speed * @port: the port we're looking at * * Return @port's speed in 802.3ad enum format. i.e. one of: * 0, * %AD_LINK_SPEED_10MBPS, * %AD_LINK_SPEED_100MBPS, * %AD_LINK_SPEED_1000MBPS, * %AD_LINK_SPEED_2500MBPS, * %AD_LINK_SPEED_5000MBPS, * %AD_LINK_SPEED_10000MBPS * %AD_LINK_SPEED_14000MBPS, * %AD_LINK_SPEED_20000MBPS * %AD_LINK_SPEED_25000MBPS * %AD_LINK_SPEED_40000MBPS * %AD_LINK_SPEED_50000MBPS * %AD_LINK_SPEED_56000MBPS * %AD_LINK_SPEED_100000MBPS * %AD_LINK_SPEED_200000MBPS * %AD_LINK_SPEED_400000MBPS * %AD_LINK_SPEED_800000MBPS */ static u16 __get_link_speed(struct port *port) { struct slave *slave = port->slave; u16 speed; /* this if covers only a special case: when the configuration starts * with link down, it sets the speed to 0. * This is done in spite of the fact that the e100 driver reports 0 * to be compatible with MVT in the future. */ if (slave->link != BOND_LINK_UP) speed = 0; else { switch (slave->speed) { case SPEED_10: speed = AD_LINK_SPEED_10MBPS; break; case SPEED_100: speed = AD_LINK_SPEED_100MBPS; break; case SPEED_1000: speed = AD_LINK_SPEED_1000MBPS; break; case SPEED_2500: speed = AD_LINK_SPEED_2500MBPS; break; case SPEED_5000: speed = AD_LINK_SPEED_5000MBPS; break; case SPEED_10000: speed = AD_LINK_SPEED_10000MBPS; break; case SPEED_14000: speed = AD_LINK_SPEED_14000MBPS; break; case SPEED_20000: speed = AD_LINK_SPEED_20000MBPS; break; case SPEED_25000: speed = AD_LINK_SPEED_25000MBPS; break; case SPEED_40000: speed = AD_LINK_SPEED_40000MBPS; break; case SPEED_50000: speed = AD_LINK_SPEED_50000MBPS; break; case SPEED_56000: speed = AD_LINK_SPEED_56000MBPS; break; case SPEED_100000: speed = AD_LINK_SPEED_100000MBPS; break; case SPEED_200000: speed = AD_LINK_SPEED_200000MBPS; break; case SPEED_400000: speed = AD_LINK_SPEED_400000MBPS; break; case SPEED_800000: speed = AD_LINK_SPEED_800000MBPS; break; default: /* unknown speed value from ethtool. shouldn't happen */ if (slave->speed != SPEED_UNKNOWN) pr_err_once("%s: (slave %s): unknown ethtool speed (%d) for port %d (set it to 0)\n", slave->bond->dev->name, slave->dev->name, slave->speed, port->actor_port_number); speed = 0; break; } } slave_dbg(slave->bond->dev, slave->dev, "Port %d Received link speed %d update from adapter\n", port->actor_port_number, speed); return speed; } /** * __get_duplex - get a port's duplex * @port: the port we're looking at * * Return @port's duplex in 802.3ad bitmask format. i.e.: * 0x01 if in full duplex * 0x00 otherwise */ static u8 __get_duplex(struct port *port) { struct slave *slave = port->slave; u8 retval = 0x0; /* handling a special case: when the configuration starts with * link down, it sets the duplex to 0. */ if (slave->link == BOND_LINK_UP) { switch (slave->duplex) { case DUPLEX_FULL: retval = 0x1; slave_dbg(slave->bond->dev, slave->dev, "Port %d Received status full duplex update from adapter\n", port->actor_port_number); break; case DUPLEX_HALF: default: retval = 0x0; slave_dbg(slave->bond->dev, slave->dev, "Port %d Received status NOT full duplex update from adapter\n", port->actor_port_number); break; } } return retval; } static void __ad_actor_update_port(struct port *port) { const struct bonding *bond = bond_get_bond_by_slave(port->slave); port->actor_system = BOND_AD_INFO(bond).system.sys_mac_addr; port->actor_system_priority = BOND_AD_INFO(bond).system.sys_priority; } /* Conversions */ /** * __ad_timer_to_ticks - convert a given timer type to AD module ticks * @timer_type: which timer to operate * @par: timer parameter. see below * * If @timer_type is %current_while_timer, @par indicates long/short timer. * If @timer_type is %periodic_timer, @par is one of %FAST_PERIODIC_TIME, * %SLOW_PERIODIC_TIME. */ static u16 __ad_timer_to_ticks(u16 timer_type, u16 par) { u16 retval = 0; /* to silence the compiler */ switch (timer_type) { case AD_CURRENT_WHILE_TIMER: /* for rx machine usage */ if (par) retval = (AD_SHORT_TIMEOUT_TIME*ad_ticks_per_sec); else retval = (AD_LONG_TIMEOUT_TIME*ad_ticks_per_sec); break; case AD_ACTOR_CHURN_TIMER: /* for local churn machine */ retval = (AD_CHURN_DETECTION_TIME*ad_ticks_per_sec); break; case AD_PERIODIC_TIMER: /* for periodic machine */ retval = (par*ad_ticks_per_sec); /* long timeout */ break; case AD_PARTNER_CHURN_TIMER: /* for remote churn machine */ retval = (AD_CHURN_DETECTION_TIME*ad_ticks_per_sec); break; case AD_WAIT_WHILE_TIMER: /* for selection machine */ retval = (AD_AGGREGATE_WAIT_TIME*ad_ticks_per_sec); break; } return retval; } /* ================= ad_rx_machine helper functions ================== */ /** * __choose_matched - update a port's matched variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Update the value of the matched variable, using parameter values from a * newly received lacpdu. Parameter values for the partner carried in the * received PDU are compared with the corresponding operational parameter * values for the actor. Matched is set to TRUE if all of these parameters * match and the PDU parameter partner_state.aggregation has the same value as * actor_oper_port_state.aggregation and lacp will actively maintain the link * in the aggregation. Matched is also set to TRUE if the value of * actor_state.aggregation in the received PDU is set to FALSE, i.e., indicates * an individual link and lacp will actively maintain the link. Otherwise, * matched is set to FALSE. LACP is considered to be actively maintaining the * link if either the PDU's actor_state.lacp_activity variable is TRUE or both * the actor's actor_oper_port_state.lacp_activity and the PDU's * partner_state.lacp_activity variables are TRUE. * * Note: the AD_PORT_MATCHED "variable" is not specified by 802.3ad; it is * used here to implement the language from 802.3ad 43.4.9 that requires * recordPDU to "match" the LACPDU parameters to the stored values. */ static void __choose_matched(struct lacpdu *lacpdu, struct port *port) { /* check if all parameters are alike * or this is individual link(aggregation == FALSE) * then update the state machine Matched variable. */ if (((ntohs(lacpdu->partner_port) == port->actor_port_number) && (ntohs(lacpdu->partner_port_priority) == port->actor_port_priority) && MAC_ADDRESS_EQUAL(&(lacpdu->partner_system), &(port->actor_system)) && (ntohs(lacpdu->partner_system_priority) == port->actor_system_priority) && (ntohs(lacpdu->partner_key) == port->actor_oper_port_key) && ((lacpdu->partner_state & LACP_STATE_AGGREGATION) == (port->actor_oper_port_state & LACP_STATE_AGGREGATION))) || ((lacpdu->actor_state & LACP_STATE_AGGREGATION) == 0) ) { port->sm_vars |= AD_PORT_MATCHED; } else { port->sm_vars &= ~AD_PORT_MATCHED; } } /** * __record_pdu - record parameters from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Record the parameter values for the Actor carried in a received lacpdu as * the current partner operational parameter values and sets * actor_oper_port_state.defaulted to FALSE. */ static void __record_pdu(struct lacpdu *lacpdu, struct port *port) { if (lacpdu && port) { struct port_params *partner = &port->partner_oper; __choose_matched(lacpdu, port); /* record the new parameter values for the partner * operational */ partner->port_number = ntohs(lacpdu->actor_port); partner->port_priority = ntohs(lacpdu->actor_port_priority); partner->system = lacpdu->actor_system; partner->system_priority = ntohs(lacpdu->actor_system_priority); partner->key = ntohs(lacpdu->actor_key); partner->port_state = lacpdu->actor_state; /* set actor_oper_port_state.defaulted to FALSE */ port->actor_oper_port_state &= ~LACP_STATE_DEFAULTED; /* set the partner sync. to on if the partner is sync, * and the port is matched */ if ((port->sm_vars & AD_PORT_MATCHED) && (lacpdu->actor_state & LACP_STATE_SYNCHRONIZATION)) { partner->port_state |= LACP_STATE_SYNCHRONIZATION; slave_dbg(port->slave->bond->dev, port->slave->dev, "partner sync=1\n"); } else { partner->port_state &= ~LACP_STATE_SYNCHRONIZATION; slave_dbg(port->slave->bond->dev, port->slave->dev, "partner sync=0\n"); } } } /** * __record_default - record default parameters * @port: the port we're looking at * * This function records the default parameter values for the partner carried * in the Partner Admin parameters as the current partner operational parameter * values and sets actor_oper_port_state.defaulted to TRUE. */ static void __record_default(struct port *port) { if (port) { /* record the partner admin parameters */ memcpy(&port->partner_oper, &port->partner_admin, sizeof(struct port_params)); /* set actor_oper_port_state.defaulted to true */ port->actor_oper_port_state |= LACP_STATE_DEFAULTED; } } /** * __update_selected - update a port's Selected variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Update the value of the selected variable, using parameter values from a * newly received lacpdu. The parameter values for the Actor carried in the * received PDU are compared with the corresponding operational parameter * values for the ports partner. If one or more of the comparisons shows that * the value(s) received in the PDU differ from the current operational values, * then selected is set to FALSE and actor_oper_port_state.synchronization is * set to out_of_sync. Otherwise, selected remains unchanged. */ static void __update_selected(struct lacpdu *lacpdu, struct port *port) { if (lacpdu && port) { const struct port_params *partner = &port->partner_oper; /* check if any parameter is different then * update the state machine selected variable. */ if (ntohs(lacpdu->actor_port) != partner->port_number || ntohs(lacpdu->actor_port_priority) != partner->port_priority || !MAC_ADDRESS_EQUAL(&lacpdu->actor_system, &partner->system) || ntohs(lacpdu->actor_system_priority) != partner->system_priority || ntohs(lacpdu->actor_key) != partner->key || (lacpdu->actor_state & LACP_STATE_AGGREGATION) != (partner->port_state & LACP_STATE_AGGREGATION)) { port->sm_vars &= ~AD_PORT_SELECTED; } } } /** * __update_default_selected - update a port's Selected variable from Partner * @port: the port we're looking at * * This function updates the value of the selected variable, using the partner * administrative parameter values. The administrative values are compared with * the corresponding operational parameter values for the partner. If one or * more of the comparisons shows that the administrative value(s) differ from * the current operational values, then Selected is set to FALSE and * actor_oper_port_state.synchronization is set to OUT_OF_SYNC. Otherwise, * Selected remains unchanged. */ static void __update_default_selected(struct port *port) { if (port) { const struct port_params *admin = &port->partner_admin; const struct port_params *oper = &port->partner_oper; /* check if any parameter is different then * update the state machine selected variable. */ if (admin->port_number != oper->port_number || admin->port_priority != oper->port_priority || !MAC_ADDRESS_EQUAL(&admin->system, &oper->system) || admin->system_priority != oper->system_priority || admin->key != oper->key || (admin->port_state & LACP_STATE_AGGREGATION) != (oper->port_state & LACP_STATE_AGGREGATION)) { port->sm_vars &= ~AD_PORT_SELECTED; } } } /** * __update_ntt - update a port's ntt variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Updates the value of the ntt variable, using parameter values from a newly * received lacpdu. The parameter values for the partner carried in the * received PDU are compared with the corresponding operational parameter * values for the Actor. If one or more of the comparisons shows that the * value(s) received in the PDU differ from the current operational values, * then ntt is set to TRUE. Otherwise, ntt remains unchanged. */ static void __update_ntt(struct lacpdu *lacpdu, struct port *port) { /* validate lacpdu and port */ if (lacpdu && port) { /* check if any parameter is different then * update the port->ntt. */ if ((ntohs(lacpdu->partner_port) != port->actor_port_number) || (ntohs(lacpdu->partner_port_priority) != port->actor_port_priority) || !MAC_ADDRESS_EQUAL(&(lacpdu->partner_system), &(port->actor_system)) || (ntohs(lacpdu->partner_system_priority) != port->actor_system_priority) || (ntohs(lacpdu->partner_key) != port->actor_oper_port_key) || ((lacpdu->partner_state & LACP_STATE_LACP_ACTIVITY) != (port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY)) || ((lacpdu->partner_state & LACP_STATE_LACP_TIMEOUT) != (port->actor_oper_port_state & LACP_STATE_LACP_TIMEOUT)) || ((lacpdu->partner_state & LACP_STATE_SYNCHRONIZATION) != (port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) || ((lacpdu->partner_state & LACP_STATE_AGGREGATION) != (port->actor_oper_port_state & LACP_STATE_AGGREGATION)) ) { port->ntt = true; } } } /** * __agg_ports_are_ready - check if all ports in an aggregator are ready * @aggregator: the aggregator we're looking at * */ static int __agg_ports_are_ready(struct aggregator *aggregator) { struct port *port; int retval = 1; if (aggregator) { /* scan all ports in this aggregator to verfy if they are * all ready. */ for (port = aggregator->lag_ports; port; port = port->next_port_in_aggregator) { if (!(port->sm_vars & AD_PORT_READY_N)) { retval = 0; break; } } } return retval; } /** * __set_agg_ports_ready - set value of Ready bit in all ports of an aggregator * @aggregator: the aggregator we're looking at * @val: Should the ports' ready bit be set on or off * */ static void __set_agg_ports_ready(struct aggregator *aggregator, int val) { struct port *port; for (port = aggregator->lag_ports; port; port = port->next_port_in_aggregator) { if (val) port->sm_vars |= AD_PORT_READY; else port->sm_vars &= ~AD_PORT_READY; } } static int __agg_active_ports(struct aggregator *agg) { struct port *port; int active = 0; for (port = agg->lag_ports; port; port = port->next_port_in_aggregator) { if (port->is_enabled) active++; } return active; } /** * __get_agg_bandwidth - get the total bandwidth of an aggregator * @aggregator: the aggregator we're looking at * */ static u32 __get_agg_bandwidth(struct aggregator *aggregator) { int nports = __agg_active_ports(aggregator); u32 bandwidth = 0; if (nports) { switch (__get_link_speed(aggregator->lag_ports)) { case AD_LINK_SPEED_1MBPS: bandwidth = nports; break; case AD_LINK_SPEED_10MBPS: bandwidth = nports * 10; break; case AD_LINK_SPEED_100MBPS: bandwidth = nports * 100; break; case AD_LINK_SPEED_1000MBPS: bandwidth = nports * 1000; break; case AD_LINK_SPEED_2500MBPS: bandwidth = nports * 2500; break; case AD_LINK_SPEED_5000MBPS: bandwidth = nports * 5000; break; case AD_LINK_SPEED_10000MBPS: bandwidth = nports * 10000; break; case AD_LINK_SPEED_14000MBPS: bandwidth = nports * 14000; break; case AD_LINK_SPEED_20000MBPS: bandwidth = nports * 20000; break; case AD_LINK_SPEED_25000MBPS: bandwidth = nports * 25000; break; case AD_LINK_SPEED_40000MBPS: bandwidth = nports * 40000; break; case AD_LINK_SPEED_50000MBPS: bandwidth = nports * 50000; break; case AD_LINK_SPEED_56000MBPS: bandwidth = nports * 56000; break; case AD_LINK_SPEED_100000MBPS: bandwidth = nports * 100000; break; case AD_LINK_SPEED_200000MBPS: bandwidth = nports * 200000; break; case AD_LINK_SPEED_400000MBPS: bandwidth = nports * 400000; break; case AD_LINK_SPEED_800000MBPS: bandwidth = nports * 800000; break; default: bandwidth = 0; /* to silence the compiler */ } } return bandwidth; } /** * __get_active_agg - get the current active aggregator * @aggregator: the aggregator we're looking at * * Caller must hold RCU lock. */ static struct aggregator *__get_active_agg(struct aggregator *aggregator) { struct bonding *bond = aggregator->slave->bond; struct list_head *iter; struct slave *slave; bond_for_each_slave_rcu(bond, slave, iter) if (SLAVE_AD_INFO(slave)->aggregator.is_active) return &(SLAVE_AD_INFO(slave)->aggregator); return NULL; } /** * __update_lacpdu_from_port - update a port's lacpdu fields * @port: the port we're looking at */ static inline void __update_lacpdu_from_port(struct port *port) { struct lacpdu *lacpdu = &port->lacpdu; const struct port_params *partner = &port->partner_oper; /* update current actual Actor parameters * lacpdu->subtype initialized * lacpdu->version_number initialized * lacpdu->tlv_type_actor_info initialized * lacpdu->actor_information_length initialized */ lacpdu->actor_system_priority = htons(port->actor_system_priority); lacpdu->actor_system = port->actor_system; lacpdu->actor_key = htons(port->actor_oper_port_key); lacpdu->actor_port_priority = htons(port->actor_port_priority); lacpdu->actor_port = htons(port->actor_port_number); lacpdu->actor_state = port->actor_oper_port_state; slave_dbg(port->slave->bond->dev, port->slave->dev, "update lacpdu: actor port state %x\n", port->actor_oper_port_state); /* lacpdu->reserved_3_1 initialized * lacpdu->tlv_type_partner_info initialized * lacpdu->partner_information_length initialized */ lacpdu->partner_system_priority = htons(partner->system_priority); lacpdu->partner_system = partner->system; lacpdu->partner_key = htons(partner->key); lacpdu->partner_port_priority = htons(partner->port_priority); lacpdu->partner_port = htons(partner->port_number); lacpdu->partner_state = partner->port_state; /* lacpdu->reserved_3_2 initialized * lacpdu->tlv_type_collector_info initialized * lacpdu->collector_information_length initialized * collector_max_delay initialized * reserved_12[12] initialized * tlv_type_terminator initialized * terminator_length initialized * reserved_50[50] initialized */ } /* ================= main 802.3ad protocol code ========================= */ /** * ad_lacpdu_send - send out a lacpdu packet on a given port * @port: the port we're looking at * * Returns: 0 on success * < 0 on error */ static int ad_lacpdu_send(struct port *port) { struct slave *slave = port->slave; struct sk_buff *skb; struct lacpdu_header *lacpdu_header; int length = sizeof(struct lacpdu_header); skb = dev_alloc_skb(length); if (!skb) return -ENOMEM; atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.lacpdu_tx); skb->dev = slave->dev; skb_reset_mac_header(skb); skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = PKT_TYPE_LACPDU; skb->priority = TC_PRIO_CONTROL; lacpdu_header = skb_put(skb, length); ether_addr_copy(lacpdu_header->hdr.h_dest, lacpdu_mcast_addr); /* Note: source address is set to be the member's PERMANENT address, * because we use it to identify loopback lacpdus in receive. */ ether_addr_copy(lacpdu_header->hdr.h_source, slave->perm_hwaddr); lacpdu_header->hdr.h_proto = PKT_TYPE_LACPDU; lacpdu_header->lacpdu = port->lacpdu; dev_queue_xmit(skb); return 0; } /** * ad_marker_send - send marker information/response on a given port * @port: the port we're looking at * @marker: marker data to send * * Returns: 0 on success * < 0 on error */ static int ad_marker_send(struct port *port, struct bond_marker *marker) { struct slave *slave = port->slave; struct sk_buff *skb; struct bond_marker_header *marker_header; int length = sizeof(struct bond_marker_header); skb = dev_alloc_skb(length + 16); if (!skb) return -ENOMEM; switch (marker->tlv_type) { case AD_MARKER_INFORMATION_SUBTYPE: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.marker_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.marker_tx); break; case AD_MARKER_RESPONSE_SUBTYPE: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.marker_resp_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.marker_resp_tx); break; } skb_reserve(skb, 16); skb->dev = slave->dev; skb_reset_mac_header(skb); skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = PKT_TYPE_LACPDU; marker_header = skb_put(skb, length); ether_addr_copy(marker_header->hdr.h_dest, lacpdu_mcast_addr); /* Note: source address is set to be the member's PERMANENT address, * because we use it to identify loopback MARKERs in receive. */ ether_addr_copy(marker_header->hdr.h_source, slave->perm_hwaddr); marker_header->hdr.h_proto = PKT_TYPE_LACPDU; marker_header->marker = *marker; dev_queue_xmit(skb); return 0; } /** * ad_mux_machine - handle a port's mux state machine * @port: the port we're looking at * @update_slave_arr: Does slave array need update? */ static void ad_mux_machine(struct port *port, bool *update_slave_arr) { mux_states_t last_state; /* keep current State Machine state to compare later if it was * changed */ last_state = port->sm_mux_state; if (port->sm_vars & AD_PORT_BEGIN) { port->sm_mux_state = AD_MUX_DETACHED; } else { switch (port->sm_mux_state) { case AD_MUX_DETACHED: if ((port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY)) /* if SELECTED or STANDBY */ port->sm_mux_state = AD_MUX_WAITING; break; case AD_MUX_WAITING: /* if SELECTED == FALSE return to DETACH state */ if (!(port->sm_vars & AD_PORT_SELECTED)) { port->sm_vars &= ~AD_PORT_READY_N; /* in order to withhold the Selection Logic to * check all ports READY_N value every callback * cycle to update ready variable, we check * READY_N and update READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); port->sm_mux_state = AD_MUX_DETACHED; break; } /* check if the wait_while_timer expired */ if (port->sm_mux_timer_counter && !(--port->sm_mux_timer_counter)) port->sm_vars |= AD_PORT_READY_N; /* in order to withhold the selection logic to check * all ports READY_N value every callback cycle to * update ready variable, we check READY_N and update * READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); /* if the wait_while_timer expired, and the port is * in READY state, move to ATTACHED state */ if ((port->sm_vars & AD_PORT_READY) && !port->sm_mux_timer_counter) port->sm_mux_state = AD_MUX_ATTACHED; break; case AD_MUX_ATTACHED: /* check also if agg_select_timer expired (so the * edable port will take place only after this timer) */ if ((port->sm_vars & AD_PORT_SELECTED) && (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) && !__check_agg_selection_timer(port)) { if (port->aggregator->is_active) port->sm_mux_state = AD_MUX_COLLECTING_DISTRIBUTING; } else if (!(port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY)) { /* if UNSELECTED or STANDBY */ port->sm_vars &= ~AD_PORT_READY_N; /* in order to withhold the selection logic to * check all ports READY_N value every callback * cycle to update ready variable, we check * READY_N and update READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); port->sm_mux_state = AD_MUX_DETACHED; } else if (port->aggregator->is_active) { port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; } break; case AD_MUX_COLLECTING_DISTRIBUTING: if (!(port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY) || !(port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) || !(port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) { port->sm_mux_state = AD_MUX_ATTACHED; } else { /* if port state hasn't changed make * sure that a collecting distributing * port in an active aggregator is enabled */ if (port->aggregator && port->aggregator->is_active && !__port_is_enabled(port)) { __enable_port(port); *update_slave_arr = true; } } break; default: break; } } /* check if the state machine was changed */ if (port->sm_mux_state != last_state) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Mux Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_mux_state); switch (port->sm_mux_state) { case AD_MUX_DETACHED: port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; ad_disable_collecting_distributing(port, update_slave_arr); port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; port->ntt = true; break; case AD_MUX_WAITING: port->sm_mux_timer_counter = __ad_timer_to_ticks(AD_WAIT_WHILE_TIMER, 0); break; case AD_MUX_ATTACHED: if (port->aggregator->is_active) port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; else port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; ad_disable_collecting_distributing(port, update_slave_arr); port->ntt = true; break; case AD_MUX_COLLECTING_DISTRIBUTING: port->actor_oper_port_state |= LACP_STATE_COLLECTING; port->actor_oper_port_state |= LACP_STATE_DISTRIBUTING; port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; ad_enable_collecting_distributing(port, update_slave_arr); port->ntt = true; break; default: break; } } } /** * ad_rx_machine - handle a port's rx State Machine * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * If lacpdu arrived, stop previous timer (if exists) and set the next state as * CURRENT. If timer expired set the state machine in the proper state. * In other cases, this function checks if we need to switch to other state. */ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) { rx_states_t last_state; /* keep current State Machine state to compare later if it was * changed */ last_state = port->sm_rx_state; if (lacpdu) { atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.lacpdu_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.lacpdu_rx); } /* check if state machine should change state */ /* first, check if port was reinitialized */ if (port->sm_vars & AD_PORT_BEGIN) { port->sm_rx_state = AD_RX_INITIALIZE; port->sm_vars |= AD_PORT_CHURNED; /* check if port is not enabled */ } else if (!(port->sm_vars & AD_PORT_BEGIN) && !port->is_enabled) port->sm_rx_state = AD_RX_PORT_DISABLED; /* check if new lacpdu arrived */ else if (lacpdu && ((port->sm_rx_state == AD_RX_EXPIRED) || (port->sm_rx_state == AD_RX_DEFAULTED) || (port->sm_rx_state == AD_RX_CURRENT))) { if (port->sm_rx_state != AD_RX_CURRENT) port->sm_vars |= AD_PORT_CHURNED; port->sm_rx_timer_counter = 0; port->sm_rx_state = AD_RX_CURRENT; } else { /* if timer is on, and if it is expired */ if (port->sm_rx_timer_counter && !(--port->sm_rx_timer_counter)) { switch (port->sm_rx_state) { case AD_RX_EXPIRED: port->sm_rx_state = AD_RX_DEFAULTED; break; case AD_RX_CURRENT: port->sm_rx_state = AD_RX_EXPIRED; break; default: break; } } else { /* if no lacpdu arrived and no timer is on */ switch (port->sm_rx_state) { case AD_RX_PORT_DISABLED: if (port->is_enabled && (port->sm_vars & AD_PORT_LACP_ENABLED)) port->sm_rx_state = AD_RX_EXPIRED; else if (port->is_enabled && ((port->sm_vars & AD_PORT_LACP_ENABLED) == 0)) port->sm_rx_state = AD_RX_LACP_DISABLED; break; default: break; } } } /* check if the State machine was changed or new lacpdu arrived */ if ((port->sm_rx_state != last_state) || (lacpdu)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Rx Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_rx_state); switch (port->sm_rx_state) { case AD_RX_INITIALIZE: if (!(port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS)) port->sm_vars &= ~AD_PORT_LACP_ENABLED; else port->sm_vars |= AD_PORT_LACP_ENABLED; port->sm_vars &= ~AD_PORT_SELECTED; __record_default(port); port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; port->sm_rx_state = AD_RX_PORT_DISABLED; fallthrough; case AD_RX_PORT_DISABLED: port->sm_vars &= ~AD_PORT_MATCHED; break; case AD_RX_LACP_DISABLED: port->sm_vars &= ~AD_PORT_SELECTED; __record_default(port); port->partner_oper.port_state &= ~LACP_STATE_AGGREGATION; port->sm_vars |= AD_PORT_MATCHED; port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; case AD_RX_EXPIRED: /* Reset of the Synchronization flag (Standard 43.4.12) * This reset cause to disable this port in the * COLLECTING_DISTRIBUTING state of the mux machine in * case of EXPIRED even if LINK_DOWN didn't arrive for * the port. */ port->partner_oper.port_state &= ~LACP_STATE_SYNCHRONIZATION; port->sm_vars &= ~AD_PORT_MATCHED; port->partner_oper.port_state |= LACP_STATE_LACP_TIMEOUT; port->partner_oper.port_state |= LACP_STATE_LACP_ACTIVITY; port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(AD_SHORT_TIMEOUT)); port->actor_oper_port_state |= LACP_STATE_EXPIRED; port->sm_vars |= AD_PORT_CHURNED; break; case AD_RX_DEFAULTED: __update_default_selected(port); __record_default(port); port->sm_vars |= AD_PORT_MATCHED; port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; case AD_RX_CURRENT: /* detect loopback situation */ if (MAC_ADDRESS_EQUAL(&(lacpdu->actor_system), &(port->actor_system))) { slave_err(port->slave->bond->dev, port->slave->dev, "An illegal loopback occurred on slave\n" "Check the configuration to verify that all adapters are connected to 802.3ad compliant switch ports\n"); return; } __update_selected(lacpdu, port); __update_ntt(lacpdu, port); __record_pdu(lacpdu, port); port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(port->actor_oper_port_state & LACP_STATE_LACP_TIMEOUT)); port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; default: break; } } } /** * ad_churn_machine - handle port churn's state machine * @port: the port we're looking at * */ static void ad_churn_machine(struct port *port) { if (port->sm_vars & AD_PORT_CHURNED) { port->sm_vars &= ~AD_PORT_CHURNED; port->sm_churn_actor_state = AD_CHURN_MONITOR; port->sm_churn_partner_state = AD_CHURN_MONITOR; port->sm_churn_actor_timer_counter = __ad_timer_to_ticks(AD_ACTOR_CHURN_TIMER, 0); port->sm_churn_partner_timer_counter = __ad_timer_to_ticks(AD_PARTNER_CHURN_TIMER, 0); return; } if (port->sm_churn_actor_timer_counter && !(--port->sm_churn_actor_timer_counter) && port->sm_churn_actor_state == AD_CHURN_MONITOR) { if (port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION) { port->sm_churn_actor_state = AD_NO_CHURN; } else { port->churn_actor_count++; port->sm_churn_actor_state = AD_CHURN; } } if (port->sm_churn_partner_timer_counter && !(--port->sm_churn_partner_timer_counter) && port->sm_churn_partner_state == AD_CHURN_MONITOR) { if (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) { port->sm_churn_partner_state = AD_NO_CHURN; } else { port->churn_partner_count++; port->sm_churn_partner_state = AD_CHURN; } } } /** * ad_tx_machine - handle a port's tx state machine * @port: the port we're looking at */ static void ad_tx_machine(struct port *port) { /* check if tx timer expired, to verify that we do not send more than * 3 packets per second */ if (port->sm_tx_timer_counter && !(--port->sm_tx_timer_counter)) { /* check if there is something to send */ if (port->ntt && (port->sm_vars & AD_PORT_LACP_ENABLED)) { __update_lacpdu_from_port(port); if (ad_lacpdu_send(port) >= 0) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Sent LACPDU on port %d\n", port->actor_port_number); /* mark ntt as false, so it will not be sent * again until demanded */ port->ntt = false; } } /* restart tx timer(to verify that we will not exceed * AD_MAX_TX_IN_SECOND */ port->sm_tx_timer_counter = ad_ticks_per_sec/AD_MAX_TX_IN_SECOND; } } /** * ad_periodic_machine - handle a port's periodic state machine * @port: the port we're looking at * @bond_params: bond parameters we will use * * Turn ntt flag on priodically to perform periodic transmission of lacpdu's. */ static void ad_periodic_machine(struct port *port, struct bond_params *bond_params) { periodic_states_t last_state; /* keep current state machine state to compare later if it was changed */ last_state = port->sm_periodic_state; /* check if port was reinitialized */ if (((port->sm_vars & AD_PORT_BEGIN) || !(port->sm_vars & AD_PORT_LACP_ENABLED) || !port->is_enabled) || (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) || !bond_params->lacp_active) { port->sm_periodic_state = AD_NO_PERIODIC; } /* check if state machine should change state */ else if (port->sm_periodic_timer_counter) { /* check if periodic state machine expired */ if (!(--port->sm_periodic_timer_counter)) { /* if expired then do tx */ port->sm_periodic_state = AD_PERIODIC_TX; } else { /* If not expired, check if there is some new timeout * parameter from the partner state */ switch (port->sm_periodic_state) { case AD_FAST_PERIODIC: if (!(port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) port->sm_periodic_state = AD_SLOW_PERIODIC; break; case AD_SLOW_PERIODIC: if ((port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) { port->sm_periodic_timer_counter = 0; port->sm_periodic_state = AD_PERIODIC_TX; } break; default: break; } } } else { switch (port->sm_periodic_state) { case AD_NO_PERIODIC: port->sm_periodic_state = AD_FAST_PERIODIC; break; case AD_PERIODIC_TX: if (!(port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) port->sm_periodic_state = AD_SLOW_PERIODIC; else port->sm_periodic_state = AD_FAST_PERIODIC; break; default: break; } } /* check if the state machine was changed */ if (port->sm_periodic_state != last_state) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Periodic Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_periodic_state); switch (port->sm_periodic_state) { case AD_NO_PERIODIC: port->sm_periodic_timer_counter = 0; break; case AD_FAST_PERIODIC: /* decrement 1 tick we lost in the PERIODIC_TX cycle */ port->sm_periodic_timer_counter = __ad_timer_to_ticks(AD_PERIODIC_TIMER, (u16)(AD_FAST_PERIODIC_TIME))-1; break; case AD_SLOW_PERIODIC: /* decrement 1 tick we lost in the PERIODIC_TX cycle */ port->sm_periodic_timer_counter = __ad_timer_to_ticks(AD_PERIODIC_TIMER, (u16)(AD_SLOW_PERIODIC_TIME))-1; break; case AD_PERIODIC_TX: port->ntt = true; break; default: break; } } } /** * ad_port_selection_logic - select aggregation groups * @port: the port we're looking at * @update_slave_arr: Does slave array need update? * * Select aggregation groups, and assign each port for it's aggregetor. The * selection logic is called in the inititalization (after all the handshkes), * and after every lacpdu receive (if selected is off). */ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) { struct aggregator *aggregator, *free_aggregator = NULL, *temp_aggregator; struct port *last_port = NULL, *curr_port; struct list_head *iter; struct bonding *bond; struct slave *slave; int found = 0; /* if the port is already Selected, do nothing */ if (port->sm_vars & AD_PORT_SELECTED) return; bond = __get_bond_by_port(port); /* if the port is connected to other aggregator, detach it */ if (port->aggregator) { /* detach the port from its former aggregator */ temp_aggregator = port->aggregator; for (curr_port = temp_aggregator->lag_ports; curr_port; last_port = curr_port, curr_port = curr_port->next_port_in_aggregator) { if (curr_port == port) { temp_aggregator->num_of_ports--; /* if it is the first port attached to the * aggregator */ if (!last_port) { temp_aggregator->lag_ports = port->next_port_in_aggregator; } else { /* not the first port attached to the * aggregator */ last_port->next_port_in_aggregator = port->next_port_in_aggregator; } /* clear the port's relations to this * aggregator */ port->aggregator = NULL; port->next_port_in_aggregator = NULL; port->actor_port_aggregator_identifier = 0; slave_dbg(bond->dev, port->slave->dev, "Port %d left LAG %d\n", port->actor_port_number, temp_aggregator->aggregator_identifier); /* if the aggregator is empty, clear its * parameters, and set it ready to be attached */ if (!temp_aggregator->lag_ports) ad_clear_agg(temp_aggregator); break; } } if (!curr_port) { /* meaning: the port was related to an aggregator * but was not on the aggregator port list */ net_warn_ratelimited("%s: (slave %s): Warning: Port %d was related to aggregator %d but was not on its port list\n", port->slave->bond->dev->name, port->slave->dev->name, port->actor_port_number, port->aggregator->aggregator_identifier); } } /* search on all aggregators for a suitable aggregator for this port */ bond_for_each_slave(bond, slave, iter) { aggregator = &(SLAVE_AD_INFO(slave)->aggregator); /* keep a free aggregator for later use(if needed) */ if (!aggregator->lag_ports) { if (!free_aggregator) free_aggregator = aggregator; continue; } /* check if current aggregator suits us */ if (((aggregator->actor_oper_aggregator_key == port->actor_oper_port_key) && /* if all parameters match AND */ MAC_ADDRESS_EQUAL(&(aggregator->partner_system), &(port->partner_oper.system)) && (aggregator->partner_system_priority == port->partner_oper.system_priority) && (aggregator->partner_oper_aggregator_key == port->partner_oper.key) ) && ((!MAC_ADDRESS_EQUAL(&(port->partner_oper.system), &(null_mac_addr)) && /* partner answers */ !aggregator->is_individual) /* but is not individual OR */ ) ) { /* attach to the founded aggregator */ port->aggregator = aggregator; port->actor_port_aggregator_identifier = port->aggregator->aggregator_identifier; port->next_port_in_aggregator = aggregator->lag_ports; port->aggregator->num_of_ports++; aggregator->lag_ports = port; slave_dbg(bond->dev, slave->dev, "Port %d joined LAG %d (existing LAG)\n", port->actor_port_number, port->aggregator->aggregator_identifier); /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; found = 1; break; } } /* the port couldn't find an aggregator - attach it to a new * aggregator */ if (!found) { if (free_aggregator) { /* assign port a new aggregator */ port->aggregator = free_aggregator; port->actor_port_aggregator_identifier = port->aggregator->aggregator_identifier; /* update the new aggregator's parameters * if port was responsed from the end-user */ if (port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS) /* if port is full duplex */ port->aggregator->is_individual = false; else port->aggregator->is_individual = true; port->aggregator->actor_admin_aggregator_key = port->actor_admin_port_key; port->aggregator->actor_oper_aggregator_key = port->actor_oper_port_key; port->aggregator->partner_system = port->partner_oper.system; port->aggregator->partner_system_priority = port->partner_oper.system_priority; port->aggregator->partner_oper_aggregator_key = port->partner_oper.key; port->aggregator->receive_state = 1; port->aggregator->transmit_state = 1; port->aggregator->lag_ports = port; port->aggregator->num_of_ports++; /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; slave_dbg(bond->dev, port->slave->dev, "Port %d joined LAG %d (new LAG)\n", port->actor_port_number, port->aggregator->aggregator_identifier); } else { slave_err(bond->dev, port->slave->dev, "Port %d did not find a suitable aggregator\n", port->actor_port_number); return; } } /* if all aggregator's ports are READY_N == TRUE, set ready=TRUE * in all aggregator's ports, else set ready=FALSE in all * aggregator's ports */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); aggregator = __get_first_agg(port); ad_agg_selection_logic(aggregator, update_slave_arr); if (!port->aggregator->is_active) port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; } /* Decide if "agg" is a better choice for the new active aggregator that * the current best, according to the ad_select policy. */ static struct aggregator *ad_agg_selection_test(struct aggregator *best, struct aggregator *curr) { /* 0. If no best, select current. * * 1. If the current agg is not individual, and the best is * individual, select current. * * 2. If current agg is individual and the best is not, keep best. * * 3. Therefore, current and best are both individual or both not * individual, so: * * 3a. If current agg partner replied, and best agg partner did not, * select current. * * 3b. If current agg partner did not reply and best agg partner * did reply, keep best. * * 4. Therefore, current and best both have partner replies or * both do not, so perform selection policy: * * BOND_AD_COUNT: Select by count of ports. If count is equal, * select by bandwidth. * * BOND_AD_STABLE, BOND_AD_BANDWIDTH: Select by bandwidth. */ if (!best) return curr; if (!curr->is_individual && best->is_individual) return curr; if (curr->is_individual && !best->is_individual) return best; if (__agg_has_partner(curr) && !__agg_has_partner(best)) return curr; if (!__agg_has_partner(curr) && __agg_has_partner(best)) return best; switch (__get_agg_selection_mode(curr->lag_ports)) { case BOND_AD_COUNT: if (__agg_active_ports(curr) > __agg_active_ports(best)) return curr; if (__agg_active_ports(curr) < __agg_active_ports(best)) return best; fallthrough; case BOND_AD_STABLE: case BOND_AD_BANDWIDTH: if (__get_agg_bandwidth(curr) > __get_agg_bandwidth(best)) return curr; break; default: net_warn_ratelimited("%s: (slave %s): Impossible agg select mode %d\n", curr->slave->bond->dev->name, curr->slave->dev->name, __get_agg_selection_mode(curr->lag_ports)); break; } return best; } static int agg_device_up(const struct aggregator *agg) { struct port *port = agg->lag_ports; if (!port) return 0; for (port = agg->lag_ports; port; port = port->next_port_in_aggregator) { if (netif_running(port->slave->dev) && netif_carrier_ok(port->slave->dev)) return 1; } return 0; } /** * ad_agg_selection_logic - select an aggregation group for a team * @agg: the aggregator we're looking at * @update_slave_arr: Does slave array need update? * * It is assumed that only one aggregator may be selected for a team. * * The logic of this function is to select the aggregator according to * the ad_select policy: * * BOND_AD_STABLE: select the aggregator with the most ports attached to * it, and to reselect the active aggregator only if the previous * aggregator has no more ports related to it. * * BOND_AD_BANDWIDTH: select the aggregator with the highest total * bandwidth, and reselect whenever a link state change takes place or the * set of slaves in the bond changes. * * BOND_AD_COUNT: select the aggregator with largest number of ports * (slaves), and reselect whenever a link state change takes place or the * set of slaves in the bond changes. * * FIXME: this function MUST be called with the first agg in the bond, or * __get_active_agg() won't work correctly. This function should be better * called with the bond itself, and retrieve the first agg from it. */ static void ad_agg_selection_logic(struct aggregator *agg, bool *update_slave_arr) { struct aggregator *best, *active, *origin; struct bonding *bond = agg->slave->bond; struct list_head *iter; struct slave *slave; struct port *port; rcu_read_lock(); origin = agg; active = __get_active_agg(agg); best = (active && agg_device_up(active)) ? active : NULL; bond_for_each_slave_rcu(bond, slave, iter) { agg = &(SLAVE_AD_INFO(slave)->aggregator); agg->is_active = 0; if (__agg_active_ports(agg) && agg_device_up(agg)) best = ad_agg_selection_test(best, agg); } if (best && __get_agg_selection_mode(best->lag_ports) == BOND_AD_STABLE) { /* For the STABLE policy, don't replace the old active * aggregator if it's still active (it has an answering * partner) or if both the best and active don't have an * answering partner. */ if (active && active->lag_ports && __agg_active_ports(active) && (__agg_has_partner(active) || (!__agg_has_partner(active) && !__agg_has_partner(best)))) { if (!(!active->actor_oper_aggregator_key && best->actor_oper_aggregator_key)) { best = NULL; active->is_active = 1; } } } if (best && (best == active)) { best = NULL; active->is_active = 1; } /* if there is new best aggregator, activate it */ if (best) { netdev_dbg(bond->dev, "(slave %s): best Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier, best->num_of_ports, best->actor_oper_aggregator_key, best->partner_oper_aggregator_key, best->is_individual, best->is_active); netdev_dbg(bond->dev, "(slave %s): best ports %p slave %p\n", best->slave ? best->slave->dev->name : "NULL", best->lag_ports, best->slave); bond_for_each_slave_rcu(bond, slave, iter) { agg = &(SLAVE_AD_INFO(slave)->aggregator); slave_dbg(bond->dev, slave->dev, "Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", agg->aggregator_identifier, agg->num_of_ports, agg->actor_oper_aggregator_key, agg->partner_oper_aggregator_key, agg->is_individual, agg->is_active); } /* check if any partner replies */ if (best->is_individual) net_warn_ratelimited("%s: Warning: No 802.3ad response from the link partner for any adapters in the bond\n", bond->dev->name); best->is_active = 1; netdev_dbg(bond->dev, "(slave %s): LAG %d chosen as the active LAG\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier); netdev_dbg(bond->dev, "(slave %s): Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier, best->num_of_ports, best->actor_oper_aggregator_key, best->partner_oper_aggregator_key, best->is_individual, best->is_active); /* disable the ports that were related to the former * active_aggregator */ if (active) { for (port = active->lag_ports; port; port = port->next_port_in_aggregator) { __disable_port(port); } } /* Slave array needs update. */ *update_slave_arr = true; } /* if the selected aggregator is of join individuals * (partner_system is NULL), enable their ports */ active = __get_active_agg(origin); if (active) { if (!__agg_has_partner(active)) { for (port = active->lag_ports; port; port = port->next_port_in_aggregator) { __enable_port(port); } *update_slave_arr = true; } } rcu_read_unlock(); bond_3ad_set_carrier(bond); } /** * ad_clear_agg - clear a given aggregator's parameters * @aggregator: the aggregator we're looking at */ static void ad_clear_agg(struct aggregator *aggregator) { if (aggregator) { aggregator->is_individual = false; aggregator->actor_admin_aggregator_key = 0; aggregator->actor_oper_aggregator_key = 0; eth_zero_addr(aggregator->partner_system.mac_addr_value); aggregator->partner_system_priority = 0; aggregator->partner_oper_aggregator_key = 0; aggregator->receive_state = 0; aggregator->transmit_state = 0; aggregator->lag_ports = NULL; aggregator->is_active = 0; aggregator->num_of_ports = 0; pr_debug("%s: LAG %d was cleared\n", aggregator->slave ? aggregator->slave->dev->name : "NULL", aggregator->aggregator_identifier); } } /** * ad_initialize_agg - initialize a given aggregator's parameters * @aggregator: the aggregator we're looking at */ static void ad_initialize_agg(struct aggregator *aggregator) { if (aggregator) { ad_clear_agg(aggregator); eth_zero_addr(aggregator->aggregator_mac_address.mac_addr_value); aggregator->aggregator_identifier = 0; aggregator->slave = NULL; } } /** * ad_initialize_port - initialize a given port's parameters * @port: the port we're looking at * @lacp_fast: boolean. whether fast periodic should be used */ static void ad_initialize_port(struct port *port, int lacp_fast) { static const struct port_params tmpl = { .system_priority = 0xffff, .key = 1, .port_number = 1, .port_priority = 0xff, .port_state = 1, }; static const struct lacpdu lacpdu = { .subtype = 0x01, .version_number = 0x01, .tlv_type_actor_info = 0x01, .actor_information_length = 0x14, .tlv_type_partner_info = 0x02, .partner_information_length = 0x14, .tlv_type_collector_info = 0x03, .collector_information_length = 0x10, .collector_max_delay = htons(AD_COLLECTOR_MAX_DELAY), }; if (port) { port->actor_port_priority = 0xff; port->actor_port_aggregator_identifier = 0; port->ntt = false; port->actor_admin_port_state = LACP_STATE_AGGREGATION | LACP_STATE_LACP_ACTIVITY; port->actor_oper_port_state = LACP_STATE_AGGREGATION | LACP_STATE_LACP_ACTIVITY; if (lacp_fast) port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT; memcpy(&port->partner_admin, &tmpl, sizeof(tmpl)); memcpy(&port->partner_oper, &tmpl, sizeof(tmpl)); port->is_enabled = true; /* private parameters */ port->sm_vars = AD_PORT_BEGIN | AD_PORT_LACP_ENABLED; port->sm_rx_state = 0; port->sm_rx_timer_counter = 0; port->sm_periodic_state = 0; port->sm_periodic_timer_counter = 0; port->sm_mux_state = 0; port->sm_mux_timer_counter = 0; port->sm_tx_state = 0; port->aggregator = NULL; port->next_port_in_aggregator = NULL; port->transaction_id = 0; port->sm_churn_actor_timer_counter = 0; port->sm_churn_actor_state = 0; port->churn_actor_count = 0; port->sm_churn_partner_timer_counter = 0; port->sm_churn_partner_state = 0; port->churn_partner_count = 0; memcpy(&port->lacpdu, &lacpdu, sizeof(lacpdu)); } } /** * ad_enable_collecting_distributing - enable a port's transmit/receive * @port: the port we're looking at * @update_slave_arr: Does slave array need update? * * Enable @port if it's in an active aggregator */ static void ad_enable_collecting_distributing(struct port *port, bool *update_slave_arr) { if (port->aggregator->is_active) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Enabling port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __enable_port(port); /* Slave array needs update */ *update_slave_arr = true; } } /** * ad_disable_collecting_distributing - disable a port's transmit/receive * @port: the port we're looking at * @update_slave_arr: Does slave array need update? */ static void ad_disable_collecting_distributing(struct port *port, bool *update_slave_arr) { if (port->aggregator && !MAC_ADDRESS_EQUAL(&(port->aggregator->partner_system), &(null_mac_addr))) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __disable_port(port); /* Slave array needs an update */ *update_slave_arr = true; } } /** * ad_marker_info_received - handle receive of a Marker information frame * @marker_info: Marker info received * @port: the port we're looking at */ static void ad_marker_info_received(struct bond_marker *marker_info, struct port *port) { struct bond_marker marker; atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.marker_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.marker_rx); /* copy the received marker data to the response marker */ memcpy(&marker, marker_info, sizeof(struct bond_marker)); /* change the marker subtype to marker response */ marker.tlv_type = AD_MARKER_RESPONSE_SUBTYPE; /* send the marker response */ if (ad_marker_send(port, &marker) >= 0) slave_dbg(port->slave->bond->dev, port->slave->dev, "Sent Marker Response on port %d\n", port->actor_port_number); } /** * ad_marker_response_received - handle receive of a marker response frame * @marker: marker PDU received * @port: the port we're looking at * * This function does nothing since we decided not to implement send and handle * response for marker PDU's, in this stage, but only to respond to marker * information. */ static void ad_marker_response_received(struct bond_marker *marker, struct port *port) { atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.marker_resp_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.marker_resp_rx); /* DO NOTHING, SINCE WE DECIDED NOT TO IMPLEMENT THIS FEATURE FOR NOW */ } /* ========= AD exported functions to the main bonding code ========= */ /* Check aggregators status in team every T seconds */ #define AD_AGGREGATOR_SELECTION_TIMER 8 /** * bond_3ad_initiate_agg_selection - initate aggregator selection * @bond: bonding struct * @timeout: timeout value to set * * Set the aggregation selection timer, to initiate an agg selection in * the very near future. Called during first initialization, and during * any down to up transitions of the bond. */ void bond_3ad_initiate_agg_selection(struct bonding *bond, int timeout) { atomic_set(&BOND_AD_INFO(bond).agg_select_timer, timeout); } /** * bond_3ad_initialize - initialize a bond's 802.3ad parameters and structures * @bond: bonding struct to work on * * Can be called only after the mac address of the bond is set. */ void bond_3ad_initialize(struct bonding *bond) { BOND_AD_INFO(bond).aggregator_identifier = 0; BOND_AD_INFO(bond).system.sys_priority = bond->params.ad_actor_sys_prio; if (is_zero_ether_addr(bond->params.ad_actor_system)) BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->dev->dev_addr); else BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->params.ad_actor_system); bond_3ad_initiate_agg_selection(bond, AD_AGGREGATOR_SELECTION_TIMER * ad_ticks_per_sec); } /** * bond_3ad_bind_slave - initialize a slave's port * @slave: slave struct to work on * * Returns: 0 on success * < 0 on error */ void bond_3ad_bind_slave(struct slave *slave) { struct bonding *bond = bond_get_bond_by_slave(slave); struct port *port; struct aggregator *aggregator; /* check that the slave has not been initialized yet. */ if (SLAVE_AD_INFO(slave)->port.slave != slave) { /* port initialization */ port = &(SLAVE_AD_INFO(slave)->port); ad_initialize_port(port, bond->params.lacp_fast); port->slave = slave; port->actor_port_number = SLAVE_AD_INFO(slave)->id; /* key is determined according to the link speed, duplex and * user key */ port->actor_admin_port_key = bond->params.ad_user_port_key << 6; ad_update_actor_keys(port, false); /* actor system is the bond's system */ __ad_actor_update_port(port); /* tx timer(to verify that no more than MAX_TX_IN_SECOND * lacpdu's are sent in one second) */ port->sm_tx_timer_counter = ad_ticks_per_sec/AD_MAX_TX_IN_SECOND; __disable_port(port); /* aggregator initialization */ aggregator = &(SLAVE_AD_INFO(slave)->aggregator); ad_initialize_agg(aggregator); aggregator->aggregator_mac_address = *((struct mac_addr *)bond->dev->dev_addr); aggregator->aggregator_identifier = ++BOND_AD_INFO(bond).aggregator_identifier; aggregator->slave = slave; aggregator->is_active = 0; aggregator->num_of_ports = 0; } } /** * bond_3ad_unbind_slave - deinitialize a slave's port * @slave: slave struct to work on * * Search for the aggregator that is related to this port, remove the * aggregator and assign another aggregator for other port related to it * (if any), and remove the port. */ void bond_3ad_unbind_slave(struct slave *slave) { struct port *port, *prev_port, *temp_port; struct aggregator *aggregator, *new_aggregator, *temp_aggregator; int select_new_active_agg = 0; struct bonding *bond = slave->bond; struct slave *slave_iter; struct list_head *iter; bool dummy_slave_update; /* Ignore this value as caller updates array */ /* Sync against bond_3ad_state_machine_handler() */ spin_lock_bh(&bond->mode_lock); aggregator = &(SLAVE_AD_INFO(slave)->aggregator); port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(bond->dev, slave->dev, "Trying to unbind an uninitialized port\n"); goto out; } slave_dbg(bond->dev, slave->dev, "Unbinding Link Aggregation Group %d\n", aggregator->aggregator_identifier); /* Tell the partner that this port is not suitable for aggregation */ port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; port->actor_oper_port_state &= ~LACP_STATE_AGGREGATION; __update_lacpdu_from_port(port); ad_lacpdu_send(port); /* check if this aggregator is occupied */ if (aggregator->lag_ports) { /* check if there are other ports related to this aggregator * except the port related to this slave(thats ensure us that * there is a reason to search for new aggregator, and that we * will find one */ if ((aggregator->lag_ports != port) || (aggregator->lag_ports->next_port_in_aggregator)) { /* find new aggregator for the related port(s) */ bond_for_each_slave(bond, slave_iter, iter) { new_aggregator = &(SLAVE_AD_INFO(slave_iter)->aggregator); /* if the new aggregator is empty, or it is * connected to our port only */ if (!new_aggregator->lag_ports || ((new_aggregator->lag_ports == port) && !new_aggregator->lag_ports->next_port_in_aggregator)) break; } if (!slave_iter) new_aggregator = NULL; /* if new aggregator found, copy the aggregator's * parameters and connect the related lag_ports to the * new aggregator */ if ((new_aggregator) && ((!new_aggregator->lag_ports) || ((new_aggregator->lag_ports == port) && !new_aggregator->lag_ports->next_port_in_aggregator))) { slave_dbg(bond->dev, slave->dev, "Some port(s) related to LAG %d - replacing with LAG %d\n", aggregator->aggregator_identifier, new_aggregator->aggregator_identifier); if ((new_aggregator->lag_ports == port) && new_aggregator->is_active) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); select_new_active_agg = 1; } new_aggregator->is_individual = aggregator->is_individual; new_aggregator->actor_admin_aggregator_key = aggregator->actor_admin_aggregator_key; new_aggregator->actor_oper_aggregator_key = aggregator->actor_oper_aggregator_key; new_aggregator->partner_system = aggregator->partner_system; new_aggregator->partner_system_priority = aggregator->partner_system_priority; new_aggregator->partner_oper_aggregator_key = aggregator->partner_oper_aggregator_key; new_aggregator->receive_state = aggregator->receive_state; new_aggregator->transmit_state = aggregator->transmit_state; new_aggregator->lag_ports = aggregator->lag_ports; new_aggregator->is_active = aggregator->is_active; new_aggregator->num_of_ports = aggregator->num_of_ports; /* update the information that is written on * the ports about the aggregator */ for (temp_port = aggregator->lag_ports; temp_port; temp_port = temp_port->next_port_in_aggregator) { temp_port->aggregator = new_aggregator; temp_port->actor_port_aggregator_identifier = new_aggregator->aggregator_identifier; } ad_clear_agg(aggregator); if (select_new_active_agg) ad_agg_selection_logic(__get_first_agg(port), &dummy_slave_update); } else { slave_warn(bond->dev, slave->dev, "unbinding aggregator, and could not find a new aggregator for its ports\n"); } } else { /* in case that the only port related to this * aggregator is the one we want to remove */ select_new_active_agg = aggregator->is_active; ad_clear_agg(aggregator); if (select_new_active_agg) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); /* select new active aggregator */ temp_aggregator = __get_first_agg(port); if (temp_aggregator) ad_agg_selection_logic(temp_aggregator, &dummy_slave_update); } } } slave_dbg(bond->dev, slave->dev, "Unbinding port %d\n", port->actor_port_number); /* find the aggregator that this port is connected to */ bond_for_each_slave(bond, slave_iter, iter) { temp_aggregator = &(SLAVE_AD_INFO(slave_iter)->aggregator); prev_port = NULL; /* search the port in the aggregator's related ports */ for (temp_port = temp_aggregator->lag_ports; temp_port; prev_port = temp_port, temp_port = temp_port->next_port_in_aggregator) { if (temp_port == port) { /* the aggregator found - detach the port from * this aggregator */ if (prev_port) prev_port->next_port_in_aggregator = temp_port->next_port_in_aggregator; else temp_aggregator->lag_ports = temp_port->next_port_in_aggregator; temp_aggregator->num_of_ports--; if (__agg_active_ports(temp_aggregator) == 0) { select_new_active_agg = temp_aggregator->is_active; if (temp_aggregator->num_of_ports == 0) ad_clear_agg(temp_aggregator); if (select_new_active_agg) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); /* select new active aggregator */ ad_agg_selection_logic(__get_first_agg(port), &dummy_slave_update); } } break; } } } port->slave = NULL; out: spin_unlock_bh(&bond->mode_lock); } /** * bond_3ad_update_ad_actor_settings - reflect change of actor settings to ports * @bond: bonding struct to work on * * If an ad_actor setting gets changed we need to update the individual port * settings so the bond device will use the new values when it gets upped. */ void bond_3ad_update_ad_actor_settings(struct bonding *bond) { struct list_head *iter; struct slave *slave; ASSERT_RTNL(); BOND_AD_INFO(bond).system.sys_priority = bond->params.ad_actor_sys_prio; if (is_zero_ether_addr(bond->params.ad_actor_system)) BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->dev->dev_addr); else BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->params.ad_actor_system); spin_lock_bh(&bond->mode_lock); bond_for_each_slave(bond, slave, iter) { struct port *port = &(SLAVE_AD_INFO(slave))->port; __ad_actor_update_port(port); port->ntt = true; } spin_unlock_bh(&bond->mode_lock); } /** * bond_agg_timer_advance - advance agg_select_timer * @bond: bonding structure * * Return true when agg_select_timer reaches 0. */ static bool bond_agg_timer_advance(struct bonding *bond) { int val, nval; while (1) { val = atomic_read(&BOND_AD_INFO(bond).agg_select_timer); if (!val) return false; nval = val - 1; if (atomic_cmpxchg(&BOND_AD_INFO(bond).agg_select_timer, val, nval) == val) break; } return nval == 0; } /** * bond_3ad_state_machine_handler - handle state machines timeout * @work: work context to fetch bonding struct to work on from * * The state machine handling concept in this module is to check every tick * which state machine should operate any function. The execution order is * round robin, so when we have an interaction between state machines, the * reply of one to each other might be delayed until next tick. * * This function also complete the initialization when the agg_select_timer * times out, and it selects an aggregator for the ports that are yet not * related to any aggregator, and selects the active aggregator for a bond. */ void bond_3ad_state_machine_handler(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, ad_work.work); struct aggregator *aggregator; struct list_head *iter; struct slave *slave; struct port *port; bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER; bool update_slave_arr = false; /* Lock to protect data accessed by all (e.g., port->sm_vars) and * against running with bond_3ad_unbind_slave. ad_rx_machine may run * concurrently due to incoming LACPDU as well. */ spin_lock_bh(&bond->mode_lock); rcu_read_lock(); /* check if there are any slaves */ if (!bond_has_slaves(bond)) goto re_arm; if (bond_agg_timer_advance(bond)) { slave = bond_first_slave_rcu(bond); port = slave ? &(SLAVE_AD_INFO(slave)->port) : NULL; /* select the active aggregator for the bond */ if (port) { if (!port->slave) { net_warn_ratelimited("%s: Warning: bond's first port is uninitialized\n", bond->dev->name); goto re_arm; } aggregator = __get_first_agg(port); ad_agg_selection_logic(aggregator, &update_slave_arr); } bond_3ad_set_carrier(bond); } /* for each port run the state machines */ bond_for_each_slave_rcu(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (!port->slave) { net_warn_ratelimited("%s: Warning: Found an uninitialized port\n", bond->dev->name); goto re_arm; } ad_rx_machine(NULL, port); ad_periodic_machine(port, &bond->params); ad_port_selection_logic(port, &update_slave_arr); ad_mux_machine(port, &update_slave_arr); ad_tx_machine(port); ad_churn_machine(port); /* turn off the BEGIN bit, since we already handled it */ if (port->sm_vars & AD_PORT_BEGIN) port->sm_vars &= ~AD_PORT_BEGIN; } re_arm: bond_for_each_slave_rcu(bond, slave, iter) { if (slave->should_notify) { should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW; break; } } rcu_read_unlock(); spin_unlock_bh(&bond->mode_lock); if (update_slave_arr) bond_slave_arr_work_rearm(bond, 0); if (should_notify_rtnl && rtnl_trylock()) { bond_slave_state_notify(bond); rtnl_unlock(); } queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks); } /** * bond_3ad_rx_indication - handle a received frame * @lacpdu: received lacpdu * @slave: slave struct to work on * * It is assumed that frames that were sent on this NIC don't returned as new * received frames (loopback). Since only the payload is given to this * function, it check for loopback. */ static int bond_3ad_rx_indication(struct lacpdu *lacpdu, struct slave *slave) { struct bonding *bond = slave->bond; int ret = RX_HANDLER_ANOTHER; struct bond_marker *marker; struct port *port; atomic64_t *stat; port = &(SLAVE_AD_INFO(slave)->port); if (!port->slave) { net_warn_ratelimited("%s: Warning: port of slave %s is uninitialized\n", slave->dev->name, slave->bond->dev->name); return ret; } switch (lacpdu->subtype) { case AD_TYPE_LACPDU: ret = RX_HANDLER_CONSUMED; slave_dbg(slave->bond->dev, slave->dev, "Received LACPDU on port %d\n", port->actor_port_number); /* Protect against concurrent state machines */ spin_lock(&slave->bond->mode_lock); ad_rx_machine(lacpdu, port); spin_unlock(&slave->bond->mode_lock); break; case AD_TYPE_MARKER: ret = RX_HANDLER_CONSUMED; /* No need to convert fields to Little Endian since we * don't use the marker's fields. */ marker = (struct bond_marker *)lacpdu; switch (marker->tlv_type) { case AD_MARKER_INFORMATION_SUBTYPE: slave_dbg(slave->bond->dev, slave->dev, "Received Marker Information on port %d\n", port->actor_port_number); ad_marker_info_received(marker, port); break; case AD_MARKER_RESPONSE_SUBTYPE: slave_dbg(slave->bond->dev, slave->dev, "Received Marker Response on port %d\n", port->actor_port_number); ad_marker_response_received(marker, port); break; default: slave_dbg(slave->bond->dev, slave->dev, "Received an unknown Marker subtype on port %d\n", port->actor_port_number); stat = &SLAVE_AD_INFO(slave)->stats.marker_unknown_rx; atomic64_inc(stat); stat = &BOND_AD_INFO(bond).stats.marker_unknown_rx; atomic64_inc(stat); } break; default: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_unknown_rx); atomic64_inc(&BOND_AD_INFO(bond).stats.lacpdu_unknown_rx); } return ret; } /** * ad_update_actor_keys - Update the oper / admin keys for a port based on * its current speed and duplex settings. * * @port: the port we'are looking at * @reset: Boolean to just reset the speed and the duplex part of the key * * The logic to change the oper / admin keys is: * (a) A full duplex port can participate in LACP with partner. * (b) When the speed is changed, LACP need to be reinitiated. */ static void ad_update_actor_keys(struct port *port, bool reset) { u8 duplex = 0; u16 ospeed = 0, speed = 0; u16 old_oper_key = port->actor_oper_port_key; port->actor_admin_port_key &= ~(AD_SPEED_KEY_MASKS|AD_DUPLEX_KEY_MASKS); if (!reset) { speed = __get_link_speed(port); ospeed = (old_oper_key & AD_SPEED_KEY_MASKS) >> 1; duplex = __get_duplex(port); port->actor_admin_port_key |= (speed << 1) | duplex; } port->actor_oper_port_key = port->actor_admin_port_key; if (old_oper_key != port->actor_oper_port_key) { /* Only 'duplex' port participates in LACP */ if (duplex) port->sm_vars |= AD_PORT_LACP_ENABLED; else port->sm_vars &= ~AD_PORT_LACP_ENABLED; if (!reset) { if (!speed) { slave_err(port->slave->bond->dev, port->slave->dev, "speed changed to 0 on port %d\n", port->actor_port_number); } else if (duplex && ospeed != speed) { /* Speed change restarts LACP state-machine */ port->sm_vars |= AD_PORT_BEGIN; } } } } /** * bond_3ad_adapter_speed_duplex_changed - handle a slave's speed / duplex * change indication * * @slave: slave struct to work on * * Handle reselection of aggregator (if needed) for this port. */ void bond_3ad_adapter_speed_duplex_changed(struct slave *slave) { struct port *port; port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(slave->bond->dev, slave->dev, "speed/duplex changed for uninitialized port\n"); return; } spin_lock_bh(&slave->bond->mode_lock); ad_update_actor_keys(port, false); spin_unlock_bh(&slave->bond->mode_lock); slave_dbg(slave->bond->dev, slave->dev, "Port %d changed speed/duplex\n", port->actor_port_number); } /** * bond_3ad_handle_link_change - handle a slave's link status change indication * @slave: slave struct to work on * @link: whether the link is now up or down * * Handle reselection of aggregator (if needed) for this port. */ void bond_3ad_handle_link_change(struct slave *slave, char link) { struct aggregator *agg; struct port *port; bool dummy; port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(slave->bond->dev, slave->dev, "link status changed for uninitialized port\n"); return; } spin_lock_bh(&slave->bond->mode_lock); /* on link down we are zeroing duplex and speed since * some of the adaptors(ce1000.lan) report full duplex/speed * instead of N/A(duplex) / 0(speed). * * on link up we are forcing recheck on the duplex and speed since * some of he adaptors(ce1000.lan) report. */ if (link == BOND_LINK_UP) { port->is_enabled = true; ad_update_actor_keys(port, false); } else { /* link has failed */ port->is_enabled = false; ad_update_actor_keys(port, true); } agg = __get_first_agg(port); ad_agg_selection_logic(agg, &dummy); spin_unlock_bh(&slave->bond->mode_lock); slave_dbg(slave->bond->dev, slave->dev, "Port %d changed link status to %s\n", port->actor_port_number, link == BOND_LINK_UP ? "UP" : "DOWN"); /* RTNL is held and mode_lock is released so it's safe * to update slave_array here. */ bond_update_slave_arr(slave->bond, NULL); } /** * bond_3ad_set_carrier - set link state for bonding master * @bond: bonding structure * * if we have an active aggregator, we're up, if not, we're down. * Presumes that we cannot have an active aggregator if there are * no slaves with link up. * * This behavior complies with IEEE 802.3 section 43.3.9. * * Called by bond_set_carrier(). Return zero if carrier state does not * change, nonzero if it does. */ int bond_3ad_set_carrier(struct bonding *bond) { struct aggregator *active; struct slave *first_slave; int ret = 1; rcu_read_lock(); first_slave = bond_first_slave_rcu(bond); if (!first_slave) { ret = 0; goto out; } active = __get_active_agg(&(SLAVE_AD_INFO(first_slave)->aggregator)); if (active) { /* are enough slaves available to consider link up? */ if (__agg_active_ports(active) < bond->params.min_links) { if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); goto out; } } else if (!netif_carrier_ok(bond->dev)) { netif_carrier_on(bond->dev); goto out; } } else if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); } out: rcu_read_unlock(); return ret; } /** * __bond_3ad_get_active_agg_info - get information of the active aggregator * @bond: bonding struct to work on * @ad_info: ad_info struct to fill with the bond's info * * Returns: 0 on success * < 0 on error */ int __bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info) { struct aggregator *aggregator = NULL; struct list_head *iter; struct slave *slave; struct port *port; bond_for_each_slave_rcu(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (port->aggregator && port->aggregator->is_active) { aggregator = port->aggregator; break; } } if (!aggregator) return -1; ad_info->aggregator_id = aggregator->aggregator_identifier; ad_info->ports = __agg_active_ports(aggregator); ad_info->actor_key = aggregator->actor_oper_aggregator_key; ad_info->partner_key = aggregator->partner_oper_aggregator_key; ether_addr_copy(ad_info->partner_system, aggregator->partner_system.mac_addr_value); return 0; } int bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info) { int ret; rcu_read_lock(); ret = __bond_3ad_get_active_agg_info(bond, ad_info); rcu_read_unlock(); return ret; } int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct lacpdu *lacpdu, _lacpdu; if (skb->protocol != PKT_TYPE_LACPDU) return RX_HANDLER_ANOTHER; if (!MAC_ADDRESS_EQUAL(eth_hdr(skb)->h_dest, lacpdu_mcast_addr)) return RX_HANDLER_ANOTHER; lacpdu = skb_header_pointer(skb, 0, sizeof(_lacpdu), &_lacpdu); if (!lacpdu) { atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_illegal_rx); atomic64_inc(&BOND_AD_INFO(bond).stats.lacpdu_illegal_rx); return RX_HANDLER_ANOTHER; } return bond_3ad_rx_indication(lacpdu, slave); } /** * bond_3ad_update_lacp_rate - change the lacp rate * @bond: bonding struct * * When modify lacp_rate parameter via sysfs, * update actor_oper_port_state of each port. * * Hold bond->mode_lock, * so we can modify port->actor_oper_port_state, * no matter bond is up or down. */ void bond_3ad_update_lacp_rate(struct bonding *bond) { struct port *port = NULL; struct list_head *iter; struct slave *slave; int lacp_fast; lacp_fast = bond->params.lacp_fast; spin_lock_bh(&bond->mode_lock); bond_for_each_slave(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (lacp_fast) port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT; else port->actor_oper_port_state &= ~LACP_STATE_LACP_TIMEOUT; } spin_unlock_bh(&bond->mode_lock); } size_t bond_3ad_stats_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_TX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_UNKNOWN_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_ILLEGAL_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_TX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_TX */ nla_total_size_64bit(sizeof(u64)); /* BOND_3AD_STAT_MARKER_UNKNOWN_RX */ } int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats) { u64 val; val = atomic64_read(&stats->lacpdu_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_unknown_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_UNKNOWN_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_illegal_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_ILLEGAL_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_resp_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_resp_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_unknown_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_UNKNOWN_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; return 0; }
22 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM irq #if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IRQ_H #include <linux/tracepoint.h> struct irqaction; struct softirq_action; #define SOFTIRQ_NAME_LIST \ softirq_name(HI) \ softirq_name(TIMER) \ softirq_name(NET_TX) \ softirq_name(NET_RX) \ softirq_name(BLOCK) \ softirq_name(IRQ_POLL) \ softirq_name(TASKLET) \ softirq_name(SCHED) \ softirq_name(HRTIMER) \ softirq_name_end(RCU) #undef softirq_name #undef softirq_name_end #define softirq_name(sirq) TRACE_DEFINE_ENUM(sirq##_SOFTIRQ); #define softirq_name_end(sirq) TRACE_DEFINE_ENUM(sirq##_SOFTIRQ); SOFTIRQ_NAME_LIST #undef softirq_name #undef softirq_name_end #define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq }, #define softirq_name_end(sirq) { sirq##_SOFTIRQ, #sirq } #define show_softirq_name(val) \ __print_symbolic(val, SOFTIRQ_NAME_LIST) /** * irq_handler_entry - called immediately before the irq action handler * @irq: irq number * @action: pointer to struct irqaction * * The struct irqaction pointed to by @action contains various * information about the handler, including the device name, * @action->name, and the device id, @action->dev_id. When used in * conjunction with the irq_handler_exit tracepoint, we can figure * out irq handler latencies. */ TRACE_EVENT(irq_handler_entry, TP_PROTO(int irq, struct irqaction *action), TP_ARGS(irq, action), TP_STRUCT__entry( __field( int, irq ) __string( name, action->name ) ), TP_fast_assign( __entry->irq = irq; __assign_str(name, action->name); ), TP_printk("irq=%d name=%s", __entry->irq, __get_str(name)) ); /** * irq_handler_exit - called immediately after the irq action handler returns * @irq: irq number * @action: pointer to struct irqaction * @ret: return value * * If the @ret value is set to IRQ_HANDLED, then we know that the corresponding * @action->handler successfully handled this irq. Otherwise, the irq might be * a shared irq line, or the irq was not handled successfully. Can be used in * conjunction with the irq_handler_entry to understand irq handler latencies. */ TRACE_EVENT(irq_handler_exit, TP_PROTO(int irq, struct irqaction *action, int ret), TP_ARGS(irq, action, ret), TP_STRUCT__entry( __field( int, irq ) __field( int, ret ) ), TP_fast_assign( __entry->irq = irq; __entry->ret = ret; ), TP_printk("irq=%d ret=%s", __entry->irq, __entry->ret ? "handled" : "unhandled") ); DECLARE_EVENT_CLASS(softirq, TP_PROTO(unsigned int vec_nr), TP_ARGS(vec_nr), TP_STRUCT__entry( __field( unsigned int, vec ) ), TP_fast_assign( __entry->vec = vec_nr; ), TP_printk("vec=%u [action=%s]", __entry->vec, show_softirq_name(__entry->vec)) ); /** * softirq_entry - called immediately before the softirq handler * @vec_nr: softirq vector number * * When used in combination with the softirq_exit tracepoint * we can determine the softirq handler routine. */ DEFINE_EVENT(softirq, softirq_entry, TP_PROTO(unsigned int vec_nr), TP_ARGS(vec_nr) ); /** * softirq_exit - called immediately after the softirq handler returns * @vec_nr: softirq vector number * * When used in combination with the softirq_entry tracepoint * we can determine the softirq handler routine. */ DEFINE_EVENT(softirq, softirq_exit, TP_PROTO(unsigned int vec_nr), TP_ARGS(vec_nr) ); /** * softirq_raise - called immediately when a softirq is raised * @vec_nr: softirq vector number * * When used in combination with the softirq_entry tracepoint * we can determine the softirq raise to run latency. */ DEFINE_EVENT(softirq, softirq_raise, TP_PROTO(unsigned int vec_nr), TP_ARGS(vec_nr) ); DECLARE_EVENT_CLASS(tasklet, TP_PROTO(struct tasklet_struct *t, void *func), TP_ARGS(t, func), TP_STRUCT__entry( __field( void *, tasklet) __field( void *, func) ), TP_fast_assign( __entry->tasklet = t; __entry->func = func; ), TP_printk("tasklet=%ps function=%ps", __entry->tasklet, __entry->func) ); /** * tasklet_entry - called immediately before the tasklet is run * @t: tasklet pointer * @func: tasklet callback or function being run * * Used to find individual tasklet execution time */ DEFINE_EVENT(tasklet, tasklet_entry, TP_PROTO(struct tasklet_struct *t, void *func), TP_ARGS(t, func) ); /** * tasklet_exit - called immediately after the tasklet is run * @t: tasklet pointer * @func: tasklet callback or function being run * * Used to find individual tasklet execution time */ DEFINE_EVENT(tasklet, tasklet_exit, TP_PROTO(struct tasklet_struct *t, void *func), TP_ARGS(t, func) ); #endif /* _TRACE_IRQ_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
4 2667 2667 2667 2667 2667 2667 2667 2667 21 2667 2667 2650 2650 2667 2650 3 2667 2650 2667 2667 2667 2667 2765 2765 2765 2764 2738 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 // SPDX-License-Identifier: GPL-2.0 /* * blk-mq scheduling framework * * Copyright (C) 2016 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/list_sort.h> #include <trace/events/block.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-wbt.h" /* * Mark a hardware queue as needing a restart. */ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) return; set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); /* * Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch) * in blk_mq_run_hw_queue(). Its pair is the barrier in * blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART, * meantime new request added to hctx->dispatch is missed to check in * blk_mq_run_hw_queue(). */ smp_mb(); blk_mq_run_hw_queue(hctx, true); } static int sched_rq_cmp(void *priv, const struct list_head *a, const struct list_head *b) { struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); return rqa->mq_hctx > rqb->mq_hctx; } static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) { struct blk_mq_hw_ctx *hctx = list_first_entry(rq_list, struct request, queuelist)->mq_hctx; struct request *rq; LIST_HEAD(hctx_list); unsigned int count = 0; list_for_each_entry(rq, rq_list, queuelist) { if (rq->mq_hctx != hctx) { list_cut_before(&hctx_list, rq_list, &rq->queuelist); goto dispatch; } count++; } list_splice_tail_init(rq_list, &hctx_list); dispatch: return blk_mq_dispatch_rq_list(hctx, &hctx_list, count); } #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. */ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; bool multi_hctxs = false, run_queue = false; bool dispatched = false, busy = false; unsigned int max_dispatch; LIST_HEAD(rq_list); int count = 0; if (hctx->dispatch_busy) max_dispatch = 1; else max_dispatch = hctx->queue->nr_requests; do { struct request *rq; int budget_token; if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; if (!list_empty_careful(&hctx->dispatch)) { busy = true; break; } budget_token = blk_mq_get_dispatch_budget(q); if (budget_token < 0) break; rq = e->type->ops.dispatch_request(hctx); if (!rq) { blk_mq_put_dispatch_budget(q, budget_token); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the * same queue and if we didn't dispatch then there's * no guarantee anyone will kick the queue. Kick it * ourselves. */ run_queue = true; break; } blk_mq_set_rq_budget_token(rq, budget_token); /* * Now this rq owns the budget which has to be released * if this rq won't be queued to driver via .queue_rq() * in blk_mq_dispatch_rq_list(). */ list_add_tail(&rq->queuelist, &rq_list); count++; if (rq->mq_hctx != hctx) multi_hctxs = true; /* * If we cannot get tag for the request, stop dequeueing * requests from the IO scheduler. We are unlikely to be able * to submit them anyway and it creates false impression for * scheduling heuristics that the device can take more IO. */ if (!blk_mq_get_driver_tag(rq)) break; } while (count < max_dispatch); if (!count) { if (run_queue) blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); } else if (multi_hctxs) { /* * Requests from different hctx may be dequeued from some * schedulers, such as bfq and deadline. * * Sort the requests in the list according to their hctx, * dispatch batching requests from same hctx at a time. */ list_sort(NULL, &rq_list, sched_rq_cmp); do { dispatched |= blk_mq_dispatch_hctx_list(&rq_list); } while (!list_empty(&rq_list)); } else { dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count); } if (busy) return -EAGAIN; return !!dispatched; } static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { unsigned long end = jiffies + HZ; int ret; do { ret = __blk_mq_do_dispatch_sched(hctx); if (ret != 1) break; if (need_resched() || time_is_before_jiffies(end)) { blk_mq_delay_run_hw_queue(hctx, 0); break; } } while (1); return ret; } static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { unsigned short idx = ctx->index_hw[hctx->type]; if (++idx == hctx->nr_ctx) idx = 0; return hctx->ctxs[idx]; } /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. */ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; LIST_HEAD(rq_list); struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); int ret = 0; struct request *rq; do { int budget_token; if (!list_empty_careful(&hctx->dispatch)) { ret = -EAGAIN; break; } if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; budget_token = blk_mq_get_dispatch_budget(q); if (budget_token < 0) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { blk_mq_put_dispatch_budget(q, budget_token); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the * same queue and if we didn't dispatch then there's * no guarantee anyone will kick the queue. Kick it * ourselves. */ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); break; } blk_mq_set_rq_budget_token(rq, budget_token); /* * Now this rq owns the budget which has to be released * if this rq won't be queued to driver via .queue_rq() * in blk_mq_dispatch_rq_list(). */ list_add(&rq->queuelist, &rq_list); /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1)); WRITE_ONCE(hctx->dispatch_from, ctx); return ret; } static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { bool need_dispatch = false; LIST_HEAD(rq_list); /* * If we have previous entries on our dispatch list, grab them first for * more fair dispatch. */ if (!list_empty_careful(&hctx->dispatch)) { spin_lock(&hctx->lock); if (!list_empty(&hctx->dispatch)) list_splice_init(&hctx->dispatch, &rq_list); spin_unlock(&hctx->lock); } /* * Only ask the scheduler for requests, if we didn't have residual * requests from the dispatch list. This is to avoid the case where * we only ever dispatch a fraction of the requests available because * of low device queue depth. Once we pull requests out of the IO * scheduler, we can no longer merge or sort them. So it's best to * leave them there for as long as we can. Mark the hw queue as * needing a restart in that case. * * We want to dispatch from the scheduler if there was nothing * on the dispatch list or we were able to dispatch from the * dispatch list. */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) return 0; need_dispatch = true; } else { need_dispatch = hctx->dispatch_busy; } if (hctx->queue->elevator) return blk_mq_do_dispatch_sched(hctx); /* dequeue request one by one from sw queue if queue is busy */ if (need_dispatch) return blk_mq_do_dispatch_ctx(hctx); blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(hctx, &rq_list, 0); return 0; } void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; /* RCU or SRCU read lock is needed before checking quiesced flag */ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) return; hctx->run++; /* * A return of -EAGAIN is an indication that hctx->dispatch is not * empty and we must run again in order to avoid starving flushes. */ if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) { if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) blk_mq_run_hw_queue(hctx, true); } } bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; bool ret = false; enum hctx_type type; if (e && e->type->ops.bio_merge) { ret = e->type->ops.bio_merge(q, bio, nr_segs); goto out_put; } ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); type = hctx->type; if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || list_empty_careful(&ctx->rq_lists[type])) goto out_put; /* default per sw-queue merge */ spin_lock(&ctx->lock); /* * Reverse check our software queue for entries that we could * potentially merge with. Currently includes a hand-wavy stop * count of 8, to not spend too much time checking for merges. */ if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) ret = true; spin_unlock(&ctx->lock); out_put: return ret; } bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free) { return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free); } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { if (blk_mq_is_shared_tags(q->tag_set->flags)) { hctx->sched_tags = q->sched_shared_tags; return 0; } hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx, q->nr_requests); if (!hctx->sched_tags) return -ENOMEM; return 0; } static void blk_mq_exit_sched_shared_tags(struct request_queue *queue) { blk_mq_free_rq_map(queue->sched_shared_tags); queue->sched_shared_tags = NULL; } /* called in queue's release handler, tagset has gone away */ static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) { if (!blk_mq_is_shared_tags(flags)) blk_mq_free_rq_map(hctx->sched_tags); hctx->sched_tags = NULL; } } if (blk_mq_is_shared_tags(flags)) blk_mq_exit_sched_shared_tags(q); } static int blk_mq_init_sched_shared_tags(struct request_queue *queue) { struct blk_mq_tag_set *set = queue->tag_set; /* * Set initial depth at max so that we don't need to reallocate for * updating nr_requests. */ queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX, MAX_SCHED_RQ); if (!queue->sched_shared_tags) return -ENOMEM; blk_mq_tag_update_sched_shared_tags(queue); return 0; } /* caller must have a reference to @e, will grab another one if successful */ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { unsigned int flags = q->tag_set->flags; struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; unsigned long i; int ret; /* * Default to double of smaller one between hw queue_depth and 128, * since we don't split into sync/async like the old code did. * Additionally, this is a per-hw queue depth. */ q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, BLKDEV_DEFAULT_RQ); if (blk_mq_is_shared_tags(flags)) { ret = blk_mq_init_sched_shared_tags(q); if (ret) return ret; } queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i); if (ret) goto err_free_map_and_rqs; } ret = e->ops.init_sched(q, e); if (ret) goto err_free_map_and_rqs; mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_sched(q); mutex_unlock(&q->debugfs_mutex); queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { ret = e->ops.init_hctx(hctx, i); if (ret) { eq = q->elevator; blk_mq_sched_free_rqs(q); blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; } } mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_sched_hctx(q, hctx); mutex_unlock(&q->debugfs_mutex); } return 0; err_free_map_and_rqs: blk_mq_sched_free_rqs(q); blk_mq_sched_tags_teardown(q, flags); q->elevator = NULL; return ret; } /* * called in either blk_queue_cleanup or elevator_switch, tagset * is required for freeing requests */ void blk_mq_sched_free_rqs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; if (blk_mq_is_shared_tags(q->tag_set->flags)) { blk_mq_free_rqs(q->tag_set, q->sched_shared_tags, BLK_MQ_NO_HCTX_IDX); } else { queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); } } } void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; unsigned long i; unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_sched_hctx(hctx); mutex_unlock(&q->debugfs_mutex); if (e->type->ops.exit_hctx && hctx->sched_data) { e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } flags = hctx->flags; } mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_sched(q); mutex_unlock(&q->debugfs_mutex); if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q, flags); q->elevator = NULL; }
18 17 13 12 18 11 11 2 8 7 5 7 6 5 4 3 4 47 13 13 12 12 50 50 9 50 50 50 1 50 50 9 8 8 47 48 1 48 48 50 48 48 48 48 1 1 1 1 1 50 6 6 6 6 2 2 1 1 3 2 1 1 5 5 6 5 1 3 4 2 2 1 4 15 15 14 13 2 11 2 14 54 1 58 3 55 58 57 5 54 1 53 53 15 39 54 58 42 11 44 14 11 44 1 1 3 3 5 38 4 38 38 39 39 38 3 37 38 39 6 2 1 5 4 3 6 27 25 23 23 27 22 21 21 19 19 19 2 7 7 7 2 1 63 19 44 12 3 11 1 2 4 3 5 4 10 2 4 10 5 4 3 1 5 5 4 1 3 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include <linux/bpf.h> #include <linux/btf_ids.h> #include <linux/filter.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/net.h> #include <linux/workqueue.h> #include <linux/skmsg.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/sock_diag.h> #include <net/udp.h> struct bpf_stab { struct bpf_map map; struct sock **sks; struct sk_psock_progs progs; spinlock_t lock; }; #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which); static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; if (attr->max_entries == 0 || attr->key_size != 4 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE); if (!stab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); spin_lock_init(&stab->lock); stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { bpf_map_area_free(stab); return ERR_PTR(-ENOMEM); } return &stab->map; } int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) { u32 ufd = attr->target_fd; struct bpf_map *map; struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); fdput(f); return ret; } int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { u32 ufd = attr->target_fd; struct bpf_prog *prog; struct bpf_map *map; struct fd f; int ret; if (attr->attach_flags || attr->replace_bpf_fd) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); prog = bpf_prog_get(attr->attach_bpf_fd); if (IS_ERR(prog)) { ret = PTR_ERR(prog); goto put_map; } if (prog->type != ptype) { ret = -EINVAL; goto put_prog; } ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); put_prog: bpf_prog_put(prog); put_map: fdput(f); return ret; } static void sock_map_sk_acquire(struct sock *sk) __acquires(&sk->sk_lock.slock) { lock_sock(sk); rcu_read_lock(); } static void sock_map_sk_release(struct sock *sk) __releases(&sk->sk_lock.slock) { rcu_read_unlock(); release_sock(sk); } static void sock_map_add_link(struct sk_psock *psock, struct sk_psock_link *link, struct bpf_map *map, void *link_raw) { link->link_raw = link_raw; link->map = map; spin_lock_bh(&psock->link_lock); list_add_tail(&link->list, &psock->link); spin_unlock_bh(&psock->link_lock); } static void sock_map_del_link(struct sock *sk, struct sk_psock *psock, void *link_raw) { bool strp_stop = false, verdict_stop = false; struct sk_psock_link *link, *tmp; spin_lock_bh(&psock->link_lock); list_for_each_entry_safe(link, tmp, &psock->link, list) { if (link->link_raw == link_raw) { struct bpf_map *map = link->map; struct sk_psock_progs *progs = sock_map_progs(map); if (psock->saved_data_ready && progs->stream_parser) strp_stop = true; if (psock->saved_data_ready && progs->stream_verdict) verdict_stop = true; if (psock->saved_data_ready && progs->skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } } spin_unlock_bh(&psock->link_lock); if (strp_stop || verdict_stop) { write_lock_bh(&sk->sk_callback_lock); if (strp_stop) sk_psock_stop_strp(sk, psock); if (verdict_stop) sk_psock_stop_verdict(sk, psock); if (psock->psock_update_sk_prot) psock->psock_update_sk_prot(sk, psock, false); write_unlock_bh(&sk->sk_callback_lock); } } static void sock_map_unref(struct sock *sk, void *link_raw) { struct sk_psock *psock = sk_psock(sk); if (likely(psock)) { sock_map_del_link(sk, psock, link_raw); sk_psock_put(sk, psock); } } static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; return sk->sk_prot->psock_update_sk_prot(sk, psock, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) { struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (psock) { if (sk->sk_prot->close != sock_map_close) { psock = ERR_PTR(-EBUSY); goto out; } if (!refcount_inc_not_zero(&psock->refcnt)) psock = ERR_PTR(-EBUSY); } out: rcu_read_unlock(); return psock; } static int sock_map_link(struct bpf_map *map, struct sock *sk) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog *stream_verdict = NULL; struct bpf_prog *stream_parser = NULL; struct bpf_prog *skb_verdict = NULL; struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; stream_verdict = READ_ONCE(progs->stream_verdict); if (stream_verdict) { stream_verdict = bpf_prog_inc_not_zero(stream_verdict); if (IS_ERR(stream_verdict)) return PTR_ERR(stream_verdict); } stream_parser = READ_ONCE(progs->stream_parser); if (stream_parser) { stream_parser = bpf_prog_inc_not_zero(stream_parser); if (IS_ERR(stream_parser)) { ret = PTR_ERR(stream_parser); goto out_put_stream_verdict; } } msg_parser = READ_ONCE(progs->msg_parser); if (msg_parser) { msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); goto out_put_stream_parser; } } skb_verdict = READ_ONCE(progs->skb_verdict); if (skb_verdict) { skb_verdict = bpf_prog_inc_not_zero(skb_verdict); if (IS_ERR(skb_verdict)) { ret = PTR_ERR(skb_verdict); goto out_put_msg_parser; } } psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser && READ_ONCE(psock->progs.stream_parser)) || (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; } } else { psock = sk_psock_init(sk, map->numa_node); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; } } if (msg_parser) psock_set_prog(&psock->progs.msg_parser, msg_parser); if (stream_parser) psock_set_prog(&psock->progs.stream_parser, stream_parser); if (stream_verdict) psock_set_prog(&psock->progs.stream_verdict, stream_verdict); if (skb_verdict) psock_set_prog(&psock->progs.skb_verdict, skb_verdict); /* msg_* and stream_* programs references tracked in psock after this * point. Reference dec and cleanup will occur through psock destructor */ ret = sock_map_init_proto(sk, psock); if (ret < 0) { sk_psock_put(sk, psock); goto out; } write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) { write_unlock_bh(&sk->sk_callback_lock); sk_psock_put(sk, psock); goto out; } sk_psock_start_strp(sk, psock); } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk,psock); } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; out_progs: if (skb_verdict) bpf_prog_put(skb_verdict); out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: if (stream_parser) bpf_prog_put(stream_parser); out_put_stream_verdict: if (stream_verdict) bpf_prog_put(stream_verdict); out: return ret; } static void sock_map_free(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; struct sock *sk; sk = xchg(psk, NULL); if (sk) { sock_hold(sk); lock_sock(sk); rcu_read_lock(); sock_map_unref(sk, psk); rcu_read_unlock(); release_sock(sk); sock_put(sk); } } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(stab->sks); bpf_map_area_free(stab); } static void sock_map_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_stab, map)->progs); } static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(key >= map->max_entries)) return NULL; return READ_ONCE(stab->sks[key]); } static void *sock_map_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_map_lookup_elem(map, *(u32 *)key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test, struct sock **psk) { struct sock *sk; int err = 0; spin_lock_bh(&stab->lock); sk = *psk; if (!sk_test || sk_test == sk) sk = xchg(psk, NULL); if (likely(sk)) sock_map_unref(sk, psk); else err = -EINVAL; spin_unlock_bh(&stab->lock); return err; } static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); __sock_map_delete(stab, sk, link_raw); } static long sock_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = *(u32 *)key; struct sock **psk; if (unlikely(i >= map->max_entries)) return -EINVAL; psk = &stab->sks[i]; return __sock_map_delete(stab, NULL, psk); } static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); u32 i = key ? *(u32 *)key : U32_MAX; u32 *key_next = next; if (i == stab->map.max_entries - 1) return -ENOENT; if (i >= stab->map.max_entries) *key_next = 0; else *key_next = i + 1; return 0; } static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct sk_psock_link *link; struct sk_psock *psock; struct sock *osk; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; if (unlikely(idx >= map->max_entries)) return -E2BIG; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); spin_lock_bh(&stab->lock); osk = stab->sks[idx]; if (osk && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!osk && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } sock_map_add_link(psock, link, map, &stab->sks[idx]); stab->sks[idx] = sk; if (osk) sock_map_unref(osk, &stab->sks[idx]); spin_unlock_bh(&stab->lock); return 0; out_unlock: spin_unlock_bh(&stab->lock); if (psock) sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops) { return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB || ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB; } static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return sk->sk_state != TCP_LISTEN; else return sk->sk_state == TCP_ESTABLISHED; } static bool sock_map_sk_is_suitable(const struct sock *sk) { return !!sk->sk_prot->psock_update_sk_prot; } static bool sock_map_sk_state_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); return true; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags) { struct socket *sock; struct sock *sk; int ret; u64 ufd; if (map->value_size == sizeof(u64)) ufd = *(u64 *)value; else ufd = *(u32 *)value; if (ufd > S32_MAX) return -EINVAL; sock = sockfd_lookup(ufd, &ret); if (!sock) return ret; sk = sock->sk; if (!sk) { ret = -EINVAL; goto out; } if (!sock_map_sk_is_suitable(sk)) { ret = -EOPNOTSUPP; goto out; } sock_map_sk_acquire(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); sock_map_sk_release(sk); out: sockfd_put(sock); return ret; } static long sock_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct sock *sk = (struct sock *)value; int ret; if (unlikely(!sk || !sk_fullsock(sk))) return -EINVAL; if (!sock_map_sk_is_suitable(sk)) return -EOPNOTSUPP; local_bh_disable(); bh_lock_sock(sk); if (!sock_map_sk_state_allowed(sk)) ret = -EOPNOTSUPP; else if (map->map_type == BPF_MAP_TYPE_SOCKMAP) ret = sock_map_update_common(map, *(u32 *)key, sk, flags); else ret = sock_hash_update_common(map, key, sk, flags); bh_unlock_sock(sk); local_bh_enable(); return ret; } BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_map_update_common(map, *(u32 *)key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_map_update_proto = { .func = bpf_sock_map_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_map_proto = { .func = bpf_sk_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, struct bpf_map *, map, u32, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_map_proto = { .func = bpf_msg_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; struct sock_map_seq_info { struct bpf_map *map; struct sock *sk; u32 index; }; struct bpf_iter__sockmap { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct bpf_map *, map); __bpf_md_ptr(void *, key); __bpf_md_ptr(struct sock *, sk); }; DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta, struct bpf_map *map, void *key, struct sock *sk) static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info) { if (unlikely(info->index >= info->map->max_entries)) return NULL; info->sk = __sock_map_lookup_elem(info->map, info->index); /* can't return sk directly, since that might be NULL */ return info; } static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_map_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_map_seq_stop */ rcu_read_lock(); return sock_map_seq_lookup_elem(info); } static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; ++*pos; ++info->index; return sock_map_seq_lookup_elem(info); } static int sock_map_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_map_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !v); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (v) { ctx.key = &info->index; ctx.sk = info->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_map_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_map_seq_show(seq, NULL); /* pairs with sock_map_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_map_seq_ops = { .start = sock_map_seq_start, .next = sock_map_seq_next, .stop = sock_map_seq_stop, .show = sock_map_seq_show, }; static int sock_map_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_map_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; return 0; } static void sock_map_fini_seq_private(void *priv_data) { struct sock_map_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_map_mem_usage(const struct bpf_map *map) { u64 usage = sizeof(struct bpf_stab); usage += (u64)map->max_entries * sizeof(struct sock *); return usage; } static const struct bpf_iter_seq_info sock_map_iter_seq_info = { .seq_ops = &sock_map_seq_ops, .init_seq_private = sock_map_init_seq_private, .fini_seq_private = sock_map_fini_seq_private, .seq_priv_size = sizeof(struct sock_map_seq_info), }; BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab) const struct bpf_map_ops sock_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_map_alloc, .map_free = sock_map_free, .map_get_next_key = sock_map_get_next_key, .map_lookup_elem_sys_only = sock_map_lookup_sys, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_map_mem_usage, .map_btf_id = &sock_map_btf_ids[0], .iter_seq_info = &sock_map_iter_seq_info, }; struct bpf_shtab_elem { struct rcu_head rcu; u32 hash; struct sock *sk; struct hlist_node node; u8 key[]; }; struct bpf_shtab_bucket { struct hlist_head head; spinlock_t lock; }; struct bpf_shtab { struct bpf_map map; struct bpf_shtab_bucket *buckets; u32 buckets_num; u32 elem_size; struct sk_psock_progs progs; atomic_t count; }; static inline u32 sock_hash_bucket_hash(const void *key, u32 len) { return jhash(key, len, 0); } static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab, u32 hash) { return &htab->buckets[hash & (htab->buckets_num - 1)]; } static struct bpf_shtab_elem * sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { struct bpf_shtab_elem *elem; hlist_for_each_entry_rcu(elem, head, node) { if (elem->hash == hash && !memcmp(&elem->key, key, key_size)) return elem; } return NULL; } static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; WARN_ON_ONCE(!rcu_read_lock_held()); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); return elem ? elem->sk : NULL; } static void sock_hash_free_elem(struct bpf_shtab *htab, struct bpf_shtab_elem *elem) { atomic_dec(&htab->count); kfree_rcu(elem, rcu); } static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk, void *link_raw) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem_probe, *elem = link_raw; struct bpf_shtab_bucket *bucket; WARN_ON_ONCE(!rcu_read_lock_held()); bucket = sock_hash_select_bucket(htab, elem->hash); /* elem may be deleted in parallel from the map, but access here * is okay since it's going away only after RCU grace period. * However, we need to check whether it's still present. */ spin_lock_bh(&bucket->lock); elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash, elem->key, map->key_size); if (elem_probe && elem_probe == elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); } static long sock_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 hash, key_size = map->key_size; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; int ret = -ENOENT; hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); ret = 0; } spin_unlock_bh(&bucket->lock); return ret; } static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab, void *key, u32 key_size, u32 hash, struct sock *sk, struct bpf_shtab_elem *old) { struct bpf_shtab_elem *new; if (atomic_inc_return(&htab->count) > htab->map.max_entries) { if (!old) { atomic_dec(&htab->count); return ERR_PTR(-E2BIG); } } new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); if (!new) { atomic_dec(&htab->count); return ERR_PTR(-ENOMEM); } memcpy(new->key, key, key_size); new->sk = sk; new->hash = hash; return new; } static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u32 key_size = map->key_size, hash; struct bpf_shtab_elem *elem, *elem_new; struct bpf_shtab_bucket *bucket; struct sk_psock_link *link; struct sk_psock *psock; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; link = sk_psock_init_link(); if (!link) return -ENOMEM; ret = sock_map_link(map, sk); if (ret < 0) goto out_free; psock = sk_psock(sk); WARN_ON_ONCE(!psock); hash = sock_hash_bucket_hash(key, key_size); bucket = sock_hash_select_bucket(htab, hash); spin_lock_bh(&bucket->lock); elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size); if (elem && flags == BPF_NOEXIST) { ret = -EEXIST; goto out_unlock; } else if (!elem && flags == BPF_EXIST) { ret = -ENOENT; goto out_unlock; } elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem); if (IS_ERR(elem_new)) { ret = PTR_ERR(elem_new); goto out_unlock; } sock_map_add_link(psock, link, map, elem_new); /* Add new element to the head of the list, so that * concurrent search will find it before old elem. */ hlist_add_head_rcu(&elem_new->node, &bucket->head); if (elem) { hlist_del_rcu(&elem->node); sock_map_unref(elem->sk, elem); sock_hash_free_elem(htab, elem); } spin_unlock_bh(&bucket->lock); return 0; out_unlock: spin_unlock_bh(&bucket->lock); sk_psock_put(sk, psock); out_free: sk_psock_free_link(link); return ret; } static int sock_hash_get_next_key(struct bpf_map *map, void *key, void *key_next) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_elem *elem, *elem_next; u32 hash, key_size = map->key_size; struct hlist_head *head; int i = 0; if (!key) goto find_first_elem; hash = sock_hash_bucket_hash(key, key_size); head = &sock_hash_select_bucket(htab, hash)->head; elem = sock_hash_lookup_elem_raw(head, hash, key, key_size); if (!elem) goto find_first_elem; elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } i = hash & (htab->buckets_num - 1); i++; find_first_elem: for (; i < htab->buckets_num; i++) { head = &sock_hash_select_bucket(htab, i)->head; elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)), struct bpf_shtab_elem, node); if (elem_next) { memcpy(key_next, elem_next->key, key_size); return 0; } } return -ENOENT; } static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) { struct bpf_shtab *htab; int i, err; if (attr->max_entries == 0 || attr->key_size == 0 || (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&htab->map, attr); htab->buckets_num = roundup_pow_of_two(htab->map.max_entries); htab->elem_size = sizeof(struct bpf_shtab_elem) + round_up(htab->map.key_size, 8); if (htab->buckets_num == 0 || htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) { err = -EINVAL; goto free_htab; } htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_shtab_bucket), htab->map.numa_node); if (!htab->buckets) { err = -ENOMEM; goto free_htab; } for (i = 0; i < htab->buckets_num; i++) { INIT_HLIST_HEAD(&htab->buckets[i].head); spin_lock_init(&htab->buckets[i].lock); } return &htab->map; free_htab: bpf_map_area_free(htab); return ERR_PTR(err); } static void sock_hash_free(struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); struct bpf_shtab_bucket *bucket; struct hlist_head unlink_list; struct bpf_shtab_elem *elem; struct hlist_node *node; int i; /* After the sync no updates or deletes will be in-flight so it * is safe to walk map and remove entries without risking a race * in EEXIST update case. */ synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); /* We are racing with sock_hash_delete_from_link to * enter the spin-lock critical section. Every socket on * the list is still linked to sockhash. Since link * exists, psock exists and holds a ref to socket. That * lets us to grab a socket ref too. */ spin_lock_bh(&bucket->lock); hlist_for_each_entry(elem, &bucket->head, node) sock_hold(elem->sk); hlist_move_list(&bucket->head, &unlink_list); spin_unlock_bh(&bucket->lock); /* Process removed entries out of atomic context to * block for socket lock before deleting the psock's * link to sockhash. */ hlist_for_each_entry_safe(elem, node, &unlink_list, node) { hlist_del(&elem->node); lock_sock(elem->sk); rcu_read_lock(); sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); sock_put(elem->sk); sock_hash_free_elem(htab, elem); } } /* wait for psock readers accessing its map link */ synchronize_rcu(); bpf_map_area_free(htab->buckets); bpf_map_area_free(htab); } static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) { struct sock *sk; if (map->value_size != sizeof(u64)) return ERR_PTR(-ENOSPC); sk = __sock_hash_lookup_elem(map, key); if (!sk) return ERR_PTR(-ENOENT); __sock_gen_cookie(sk); return &sk->sk_cookie; } static void *sock_hash_lookup(struct bpf_map *map, void *key) { struct sock *sk; sk = __sock_hash_lookup_elem(map, key); if (!sk) return NULL; if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; return sk; } static void sock_hash_release_progs(struct bpf_map *map) { psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs); } BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(sock_map_sk_is_suitable(sops->sk) && sock_map_op_okay(sops))) return sock_hash_update_common(map, key, sops->sk, flags); return -EOPNOTSUPP; } const struct bpf_func_proto bpf_sock_hash_update_proto = { .func = bpf_sock_hash_update, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } const struct bpf_func_proto bpf_sk_redirect_hash_proto = { .func = bpf_sk_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, struct bpf_map *, map, void *, key, u64, flags) { struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; msg->flags = flags; msg->sk_redir = sk; return SK_PASS; } const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .func = bpf_msg_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; struct sock_hash_seq_info { struct bpf_map *map; struct bpf_shtab *htab; u32 bucket_id; }; static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info, struct bpf_shtab_elem *prev_elem) { const struct bpf_shtab *htab = info->htab; struct bpf_shtab_bucket *bucket; struct bpf_shtab_elem *elem; struct hlist_node *node; /* try to find next elem in the same bucket */ if (prev_elem) { node = rcu_dereference(hlist_next_rcu(&prev_elem->node)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; /* no more elements, continue in the next bucket */ info->bucket_id++; } for (; info->bucket_id < htab->buckets_num; info->bucket_id++) { bucket = &htab->buckets[info->bucket_id]; node = rcu_dereference(hlist_first_rcu(&bucket->head)); elem = hlist_entry_safe(node, struct bpf_shtab_elem, node); if (elem) return elem; } return NULL; } static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { struct sock_hash_seq_info *info = seq->private; if (*pos == 0) ++*pos; /* pairs with sock_hash_seq_stop */ rcu_read_lock(); return sock_hash_seq_find_next(info, NULL); } static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; ++*pos; return sock_hash_seq_find_next(info, v); } static int sock_hash_seq_show(struct seq_file *seq, void *v) __must_hold(rcu) { struct sock_hash_seq_info *info = seq->private; struct bpf_iter__sockmap ctx = {}; struct bpf_shtab_elem *elem = v; struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, !elem); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (elem) { ctx.key = elem->key; ctx.sk = elem->sk; } return bpf_iter_run_prog(prog, &ctx); } static void sock_hash_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { if (!v) (void)sock_hash_seq_show(seq, NULL); /* pairs with sock_hash_seq_start */ rcu_read_unlock(); } static const struct seq_operations sock_hash_seq_ops = { .start = sock_hash_seq_start, .next = sock_hash_seq_next, .stop = sock_hash_seq_stop, .show = sock_hash_seq_show, }; static int sock_hash_init_seq_private(void *priv_data, struct bpf_iter_aux_info *aux) { struct sock_hash_seq_info *info = priv_data; bpf_map_inc_with_uref(aux->map); info->map = aux->map; info->htab = container_of(aux->map, struct bpf_shtab, map); return 0; } static void sock_hash_fini_seq_private(void *priv_data) { struct sock_hash_seq_info *info = priv_data; bpf_map_put_with_uref(info->map); } static u64 sock_hash_mem_usage(const struct bpf_map *map) { struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map); u64 usage = sizeof(*htab); usage += htab->buckets_num * sizeof(struct bpf_shtab_bucket); usage += atomic_read(&htab->count) * (u64)htab->elem_size; return usage; } static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { .seq_ops = &sock_hash_seq_ops, .init_seq_private = sock_hash_init_seq_private, .fini_seq_private = sock_hash_fini_seq_private, .seq_priv_size = sizeof(struct sock_hash_seq_info), }; BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab) const struct bpf_map_ops sock_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_hash_alloc, .map_free = sock_hash_free, .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_lookup_elem = sock_hash_lookup, .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, .map_mem_usage = sock_hash_mem_usage, .map_btf_id = &sock_hash_map_btf_ids[0], .iter_seq_info = &sock_hash_iter_seq_info, }; static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) { switch (map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return &container_of(map, struct bpf_stab, map)->progs; case BPF_MAP_TYPE_SOCKHASH: return &container_of(map, struct bpf_shtab, map)->progs; default: break; } return NULL; } static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: *pprog = &progs->msg_parser; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: *pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; *pprog = &progs->stream_verdict; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; *pprog = &progs->skb_verdict; break; default: return -EOPNOTSUPP; } return 0; } static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which) { struct bpf_prog **pprog; int ret; ret = sock_map_prog_lookup(map, &pprog, which); if (ret) return ret; if (old) return psock_replace_prog(pprog, prog, old); psock_set_prog(pprog, prog); return 0; } int sock_map_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd; struct bpf_prog **pprog; struct bpf_prog *prog; struct bpf_map *map; struct fd f; u32 id = 0; int ret; if (attr->query.query_flags) return -EINVAL; f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); rcu_read_lock(); ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); if (ret) goto end; prog = *pprog; prog_cnt = !prog ? 0 : 1; if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) goto end; /* we do not hold the refcnt, the bpf prog may be released * asynchronously and the id would be set to 0. */ id = data_race(prog->aux->id); if (id == 0) prog_cnt = 0; end: rcu_read_unlock(); if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) ret = -EFAULT; fdput(f); return ret; } static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { case BPF_MAP_TYPE_SOCKMAP: return sock_map_delete_from_link(link->map, sk, link->link_raw); case BPF_MAP_TYPE_SOCKHASH: return sock_hash_delete_from_link(link->map, sk, link->link_raw); default: break; } } static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock) { struct sk_psock_link *link; while ((link = sk_psock_link_pop(psock))) { sock_map_unlink(sk, link); sk_psock_free_link(link); } } void sock_map_unhash(struct sock *sk) { void (*saved_unhash)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_unhash = READ_ONCE(sk->sk_prot)->unhash; } else { saved_unhash = psock->saved_unhash; sock_map_remove_links(sk, psock); rcu_read_unlock(); } if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) return; if (saved_unhash) saved_unhash(sk); } EXPORT_SYMBOL_GPL(sock_map_unhash); void sock_map_destroy(struct sock *sk) { void (*saved_destroy)(struct sock *sk); struct sk_psock *psock; rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_destroy = READ_ONCE(sk->sk_prot)->destroy; } else { saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); sk_psock_put(sk, psock); } if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) return; if (saved_destroy) saved_destroy(sk); } EXPORT_SYMBOL_GPL(sock_map_destroy); void sock_map_close(struct sock *sk, long timeout) { void (*saved_close)(struct sock *sk, long timeout); struct sk_psock *psock; lock_sock(sk); rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); release_sock(sk); saved_close = READ_ONCE(sk->sk_prot)->close; } else { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); } /* Make sure we do not recurse. This is a bug. * Leak the socket instead of crashing on a stack overflow. */ if (WARN_ON_ONCE(saved_close == sock_map_close)) return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) { struct bpf_map *map; int err = -EINVAL; if (!linfo->map.map_fd) return -EBADF; map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) goto put_map; if (prog->aux->max_rdonly_access > map->key_size) { err = -EACCES; goto put_map; } aux->map = map; return 0; put_map: bpf_map_put_with_uref(map); return err; } static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux) { bpf_map_put_with_uref(aux->map); } static struct bpf_iter_reg sock_map_iter_reg = { .target = "sockmap", .attach_target = sock_map_iter_attach_target, .detach_target = sock_map_iter_detach_target, .show_fdinfo = bpf_iter_map_show_fdinfo, .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, }; static int __init bpf_sockmap_iter_init(void) { sock_map_iter_reg.ctx_arg_info[1].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK]; return bpf_iter_reg_target(&sock_map_iter_reg); } late_initcall(bpf_sockmap_iter_init);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 /* SPDX-License-Identifier: GPL-2.0 */ /* * Vxlan private header file * */ #ifndef _VXLAN_PRIVATE_H #define _VXLAN_PRIVATE_H #include <linux/rhashtable.h> extern unsigned int vxlan_net_id; extern const u8 all_zeros_mac[ETH_ALEN + 2]; extern const struct rhashtable_params vxlan_vni_rht_params; #define PORT_HASH_BITS 8 #define PORT_HASH_SIZE (1 << PORT_HASH_BITS) /* per-network namespace private data for this module */ struct vxlan_net { struct list_head vxlan_list; struct hlist_head sock_list[PORT_HASH_SIZE]; spinlock_t sock_lock; struct notifier_block nexthop_notifier_block; }; /* Forwarding table entry */ struct vxlan_fdb { struct hlist_node hlist; /* linked list of entries */ struct rcu_head rcu; unsigned long updated; /* jiffies */ unsigned long used; struct list_head remotes; u8 eth_addr[ETH_ALEN]; u16 state; /* see ndm_state */ __be32 vni; u16 flags; /* see ndm_flags and below */ struct list_head nh_list; struct nexthop __rcu *nh; struct vxlan_dev __rcu *vdev; }; #define NTF_VXLAN_ADDED_BY_USER 0x100 /* Virtual Network hash table head */ static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni) { return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)]; } /* Socket hash table head */ static inline struct hlist_head *vs_head(struct net *net, __be16 port) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; } /* First remote destination for a forwarding entry. * Guaranteed to be non-NULL because remotes are never deleted. */ static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) { if (rcu_access_pointer(fdb->nh)) return NULL; return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); } static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) { if (rcu_access_pointer(fdb->nh)) return NULL; return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); } #if IS_ENABLED(CONFIG_IPV6) static inline bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) { if (a->sa.sa_family != b->sa.sa_family) return false; if (a->sa.sa_family == AF_INET6) return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr); else return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; } static inline int vxlan_nla_get_addr(union vxlan_addr *ip, const struct nlattr *nla) { if (nla_len(nla) >= sizeof(struct in6_addr)) { ip->sin6.sin6_addr = nla_get_in6_addr(nla); ip->sa.sa_family = AF_INET6; return 0; } else if (nla_len(nla) >= sizeof(__be32)) { ip->sin.sin_addr.s_addr = nla_get_in_addr(nla); ip->sa.sa_family = AF_INET; return 0; } else { return -EAFNOSUPPORT; } } static inline int vxlan_nla_put_addr(struct sk_buff *skb, int attr, const union vxlan_addr *ip) { if (ip->sa.sa_family == AF_INET6) return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr); else return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr); } static inline bool vxlan_addr_is_multicast(const union vxlan_addr *ip) { if (ip->sa.sa_family == AF_INET6) return ipv6_addr_is_multicast(&ip->sin6.sin6_addr); else return ipv4_is_multicast(ip->sin.sin_addr.s_addr); } #else /* !CONFIG_IPV6 */ static inline bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) { return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; } static inline int vxlan_nla_get_addr(union vxlan_addr *ip, const struct nlattr *nla) { if (nla_len(nla) >= sizeof(struct in6_addr)) { return -EAFNOSUPPORT; } else if (nla_len(nla) >= sizeof(__be32)) { ip->sin.sin_addr.s_addr = nla_get_in_addr(nla); ip->sa.sa_family = AF_INET; return 0; } else { return -EAFNOSUPPORT; } } static inline int vxlan_nla_put_addr(struct sk_buff *skb, int attr, const union vxlan_addr *ip) { return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr); } static inline bool vxlan_addr_is_multicast(const union vxlan_addr *ip) { return ipv4_is_multicast(ip->sin.sin_addr.s_addr); } #endif static inline size_t vxlan_addr_size(const union vxlan_addr *ip) { if (ip->sa.sa_family == AF_INET6) return sizeof(struct in6_addr); else return sizeof(__be32); } static inline struct vxlan_vni_node * vxlan_vnifilter_lookup(struct vxlan_dev *vxlan, __be32 vni) { struct vxlan_vni_group *vg; vg = rcu_dereference_rtnl(vxlan->vnigrp); if (!vg) return NULL; return rhashtable_lookup_fast(&vg->vni_hash, &vni, vxlan_vni_rht_params); } /* vxlan_core.c */ int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, struct vxlan_fdb **fdb, struct netlink_ext_ack *extack); int __vxlan_fdb_delete(struct vxlan_dev *vxlan, const unsigned char *addr, union vxlan_addr ip, __be16 port, __be32 src_vni, __be32 vni, u32 ifindex, bool swdev_notify); u32 eth_vni_hash(const unsigned char *addr, __be32 vni); u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni); int vxlan_fdb_update(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack); void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, __be32 default_vni, struct vxlan_rdst *rdst, bool did_rsc); int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan, struct vxlan_config *conf, __be32 vni); /* vxlan_vnifilter.c */ int vxlan_vnigroup_init(struct vxlan_dev *vxlan); void vxlan_vnigroup_uninit(struct vxlan_dev *vxlan); void vxlan_vnifilter_init(void); void vxlan_vnifilter_uninit(void); void vxlan_vnifilter_count(struct vxlan_dev *vxlan, __be32 vni, struct vxlan_vni_node *vninode, int type, unsigned int len); void vxlan_vs_add_vnigrp(struct vxlan_dev *vxlan, struct vxlan_sock *vs, bool ipv6); void vxlan_vs_del_vnigrp(struct vxlan_dev *vxlan); int vxlan_vnilist_update_group(struct vxlan_dev *vxlan, union vxlan_addr *old_remote_ip, union vxlan_addr *new_remote_ip, struct netlink_ext_ack *extack); /* vxlan_multicast.c */ int vxlan_multicast_join(struct vxlan_dev *vxlan); int vxlan_multicast_leave(struct vxlan_dev *vxlan); bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev, __be32 vni, union vxlan_addr *rip, int rifindex); int vxlan_igmp_join(struct vxlan_dev *vxlan, union vxlan_addr *rip, int rifindex); int vxlan_igmp_leave(struct vxlan_dev *vxlan, union vxlan_addr *rip, int rifindex); /* vxlan_mdb.c */ int vxlan_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb); int vxlan_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack); int vxlan_mdb_del(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int vxlan_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack); struct vxlan_mdb_entry *vxlan_mdb_entry_skb_get(struct vxlan_dev *vxlan, struct sk_buff *skb, __be32 src_vni); netdev_tx_t vxlan_mdb_xmit(struct vxlan_dev *vxlan, const struct vxlan_mdb_entry *mdb_entry, struct sk_buff *skb); int vxlan_mdb_init(struct vxlan_dev *vxlan); void vxlan_mdb_fini(struct vxlan_dev *vxlan); #endif
1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2021 NXP */ #include "netlink.h" #include "common.h" struct phc_vclocks_req_info { struct ethnl_req_info base; }; struct phc_vclocks_reply_data { struct ethnl_reply_data base; int num; int *index; }; #define PHC_VCLOCKS_REPDATA(__reply_base) \ container_of(__reply_base, struct phc_vclocks_reply_data, base) const struct nla_policy ethnl_phc_vclocks_get_policy[] = { [ETHTOOL_A_PHC_VCLOCKS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->num = ethtool_get_phc_vclocks(dev, &data->index); ethnl_ops_complete(dev); return ret; } static int phc_vclocks_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); int len = 0; if (data->num > 0) { len += nla_total_size(sizeof(u32)); len += nla_total_size(sizeof(s32) * data->num); } return len; } static int phc_vclocks_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); if (data->num <= 0) return 0; if (nla_put_u32(skb, ETHTOOL_A_PHC_VCLOCKS_NUM, data->num) || nla_put(skb, ETHTOOL_A_PHC_VCLOCKS_INDEX, sizeof(s32) * data->num, data->index)) return -EMSGSIZE; return 0; } static void phc_vclocks_cleanup_data(struct ethnl_reply_data *reply_base) { const struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); kfree(data->index); } const struct ethnl_request_ops ethnl_phc_vclocks_request_ops = { .request_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, .reply_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, .hdr_attr = ETHTOOL_A_PHC_VCLOCKS_HEADER, .req_info_size = sizeof(struct phc_vclocks_req_info), .reply_data_size = sizeof(struct phc_vclocks_reply_data), .prepare_data = phc_vclocks_prepare_data, .reply_size = phc_vclocks_reply_size, .fill_reply = phc_vclocks_fill_reply, .cleanup_data = phc_vclocks_cleanup_data, };
81 12 140 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 // SPDX-License-Identifier: GPL-2.0-or-later /* * Spanning tree protocol; timer-related code * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/times.h> #include "br_private.h" #include "br_private_stp.h" /* called under bridge lock */ static int br_is_designated_for_some_port(const struct net_bridge *br) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED && !memcmp(&p->designated_bridge, &br->bridge_id, 8)) return 1; } return 0; } static void br_hello_timer_expired(struct timer_list *t) { struct net_bridge *br = from_timer(br, t, hello_timer); br_debug(br, "hello timer expired\n"); spin_lock(&br->lock); if (br->dev->flags & IFF_UP) { br_config_bpdu_generation(br); if (br->stp_enabled == BR_KERNEL_STP) mod_timer(&br->hello_timer, round_jiffies(jiffies + br->hello_time)); } spin_unlock(&br->lock); } static void br_message_age_timer_expired(struct timer_list *t) { struct net_bridge_port *p = from_timer(p, t, message_age_timer); struct net_bridge *br = p->br; const bridge_id *id = &p->designated_bridge; int was_root; if (p->state == BR_STATE_DISABLED) return; br_info(br, "port %u(%s) neighbor %.2x%.2x.%pM lost\n", (unsigned int) p->port_no, p->dev->name, id->prio[0], id->prio[1], &id->addr); /* * According to the spec, the message age timer cannot be * running when we are the root bridge. So.. this was_root * check is redundant. I'm leaving it in for now, though. */ spin_lock(&br->lock); if (p->state == BR_STATE_DISABLED) goto unlock; was_root = br_is_root_bridge(br); br_become_designated_port(p); br_configuration_update(br); br_port_state_selection(br); if (br_is_root_bridge(br) && !was_root) br_become_root_bridge(br); unlock: spin_unlock(&br->lock); } static void br_forward_delay_timer_expired(struct timer_list *t) { struct net_bridge_port *p = from_timer(p, t, forward_delay_timer); struct net_bridge *br = p->br; br_debug(br, "port %u(%s) forward delay timer\n", (unsigned int) p->port_no, p->dev->name); spin_lock(&br->lock); if (p->state == BR_STATE_LISTENING) { br_set_state(p, BR_STATE_LEARNING); mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); } else if (p->state == BR_STATE_LEARNING) { br_set_state(p, BR_STATE_FORWARDING); if (br_is_designated_for_some_port(br)) br_topology_change_detection(br); netif_carrier_on(br->dev); } rcu_read_lock(); br_ifinfo_notify(RTM_NEWLINK, NULL, p); rcu_read_unlock(); spin_unlock(&br->lock); } static void br_tcn_timer_expired(struct timer_list *t) { struct net_bridge *br = from_timer(br, t, tcn_timer); br_debug(br, "tcn timer expired\n"); spin_lock(&br->lock); if (!br_is_root_bridge(br) && (br->dev->flags & IFF_UP)) { br_transmit_tcn(br); mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time); } spin_unlock(&br->lock); } static void br_topology_change_timer_expired(struct timer_list *t) { struct net_bridge *br = from_timer(br, t, topology_change_timer); br_debug(br, "topo change timer expired\n"); spin_lock(&br->lock); br->topology_change_detected = 0; __br_set_topology_change(br, 0); spin_unlock(&br->lock); } static void br_hold_timer_expired(struct timer_list *t) { struct net_bridge_port *p = from_timer(p, t, hold_timer); br_debug(p->br, "port %u(%s) hold timer expired\n", (unsigned int) p->port_no, p->dev->name); spin_lock(&p->br->lock); if (p->config_pending) br_transmit_config(p); spin_unlock(&p->br->lock); } void br_stp_timer_init(struct net_bridge *br) { timer_setup(&br->hello_timer, br_hello_timer_expired, 0); timer_setup(&br->tcn_timer, br_tcn_timer_expired, 0); timer_setup(&br->topology_change_timer, br_topology_change_timer_expired, 0); } void br_stp_port_timer_init(struct net_bridge_port *p) { timer_setup(&p->message_age_timer, br_message_age_timer_expired, 0); timer_setup(&p->forward_delay_timer, br_forward_delay_timer_expired, 0); timer_setup(&p->hold_timer, br_hold_timer_expired, 0); } /* Report ticks left (in USER_HZ) used for API */ unsigned long br_timer_value(const struct timer_list *timer) { return timer_pending(timer) ? jiffies_delta_to_clock_t(timer->expires - jiffies) : 0; }
3260 3260 3259 221 221 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 // SPDX-License-Identifier: GPL-2.0-only /* * fs/crypto/hooks.c * * Encryption hooks for higher-level filesystem operations. */ #include "fscrypt_private.h" /** * fscrypt_file_open() - prepare to open a possibly-encrypted regular file * @inode: the inode being opened * @filp: the struct file being set up * * Currently, an encrypted regular file can only be opened if its encryption key * is available; access to the raw encrypted contents is not supported. * Therefore, we first set up the inode's encryption key (if not already done) * and return an error if it's unavailable. * * We also verify that if the parent directory (from the path via which the file * is being opened) is encrypted, then the inode being opened uses the same * encryption policy. This is needed as part of the enforcement that all files * in an encrypted directory tree use the same encryption policy, as a * protection against certain types of offline attacks. Note that this check is * needed even when opening an *unencrypted* file, since it's forbidden to have * an unencrypted file in an encrypted directory. * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code */ int fscrypt_file_open(struct inode *inode, struct file *filp) { int err; struct dentry *dir; err = fscrypt_require_key(inode); if (err) return err; dir = dget_parent(file_dentry(filp)); if (IS_ENCRYPTED(d_inode(dir)) && !fscrypt_has_permitted_context(d_inode(dir), inode)) { fscrypt_warn(inode, "Inconsistent encryption context (parent directory: %lu)", d_inode(dir)->i_ino); err = -EPERM; } dput(dir); return err; } EXPORT_SYMBOL_GPL(fscrypt_file_open); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { if (fscrypt_is_nokey_name(dentry)) return -ENOKEY; /* * We don't need to separately check that the directory inode's key is * available, as it's implied by the dentry not being a no-key name. */ if (!fscrypt_has_permitted_context(dir, inode)) return -EXDEV; return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_link); int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (fscrypt_is_nokey_name(old_dentry) || fscrypt_is_nokey_name(new_dentry)) return -ENOKEY; /* * We don't need to separately check that the directory inodes' keys are * available, as it's implied by the dentries not being no-key names. */ if (old_dir != new_dir) { if (IS_ENCRYPTED(new_dir) && !fscrypt_has_permitted_context(new_dir, d_inode(old_dentry))) return -EXDEV; if ((flags & RENAME_EXCHANGE) && IS_ENCRYPTED(old_dir) && !fscrypt_has_permitted_context(old_dir, d_inode(new_dentry))) return -EXDEV; } return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename); int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { int err = fscrypt_setup_filename(dir, &dentry->d_name, 1, fname); if (err && err != -ENOENT) return err; if (fname->is_nokey_name) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); } return err; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); /** * fscrypt_prepare_lookup_partial() - prepare lookup without filename setup * @dir: the encrypted directory being searched * @dentry: the dentry being looked up in @dir * * This function should be used by the ->lookup and ->atomic_open methods of * filesystems that handle filename encryption and no-key name encoding * themselves and thus can't use fscrypt_prepare_lookup(). Like * fscrypt_prepare_lookup(), this will try to set up the directory's encryption * key and will set DCACHE_NOKEY_NAME on the dentry if the key is unavailable. * However, this function doesn't set up a struct fscrypt_name for the filename. * * Return: 0 on success; -errno on error. Note that the encryption key being * unavailable is not considered an error. It is also not an error if * the encryption policy is unsupported by this kernel; that is treated * like the key being unavailable, so that files can still be deleted. */ int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { int err = fscrypt_get_encryption_info(dir, true); if (!err && !fscrypt_has_encryption_key(dir)) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); } return err; } EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial); int __fscrypt_prepare_readdir(struct inode *dir) { return fscrypt_get_encryption_info(dir, true); } EXPORT_SYMBOL_GPL(__fscrypt_prepare_readdir); int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (attr->ia_valid & ATTR_SIZE) return fscrypt_require_key(d_inode(dentry)); return 0; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr); /** * fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS * @inode: the inode on which flags are being changed * @oldflags: the old flags * @flags: the new flags * * The caller should be holding i_rwsem for write. * * Return: 0 on success; -errno if the flags change isn't allowed or if * another error occurs. */ int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { struct fscrypt_inode_info *ci; struct fscrypt_master_key *mk; int err; /* * When the CASEFOLD flag is set on an encrypted directory, we must * derive the secret key needed for the dirhash. This is only possible * if the directory uses a v2 encryption policy. */ if (IS_ENCRYPTED(inode) && (flags & ~oldflags & FS_CASEFOLD_FL)) { err = fscrypt_require_key(inode); if (err) return err; ci = inode->i_crypt_info; if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; mk = ci->ci_master_key; down_read(&mk->mk_sem); if (mk->mk_present) err = fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; up_read(&mk->mk_sem); return err; } return 0; } /** * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink * @dir: directory in which the symlink is being created * @target: plaintext symlink target * @len: length of @target excluding null terminator * @max_len: space the filesystem has available to store the symlink target * @disk_link: (out) the on-disk symlink target being prepared * * This function computes the size the symlink target will require on-disk, * stores it in @disk_link->len, and validates it against @max_len. An * encrypted symlink may be longer than the original. * * Additionally, @disk_link->name is set to @target if the symlink will be * unencrypted, but left NULL if the symlink will be encrypted. For encrypted * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the * on-disk target later. (The reason for the two-step process is that some * filesystems need to know the size of the symlink target before creating the * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.) * * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long, * -ENOKEY if the encryption key is missing, or another -errno code if a problem * occurred while setting up the encryption key. */ int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) { const union fscrypt_policy *policy; /* * To calculate the size of the encrypted symlink target we need to know * the amount of NUL padding, which is determined by the flags set in * the encryption policy which will be inherited from the directory. */ policy = fscrypt_policy_to_inherit(dir); if (policy == NULL) { /* Not encrypted */ disk_link->name = (unsigned char *)target; disk_link->len = len + 1; if (disk_link->len > max_len) return -ENAMETOOLONG; return 0; } if (IS_ERR(policy)) return PTR_ERR(policy); /* * Calculate the size of the encrypted symlink and verify it won't * exceed max_len. Note that for historical reasons, encrypted symlink * targets are prefixed with the ciphertext length, despite this * actually being redundant with i_size. This decreases by 2 bytes the * longest symlink target we can accept. * * We could recover 1 byte by not counting a null terminator, but * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. */ if (!__fscrypt_fname_encrypted_size(policy, len, max_len - sizeof(struct fscrypt_symlink_data) - 1, &disk_link->len)) return -ENAMETOOLONG; disk_link->len += sizeof(struct fscrypt_symlink_data) + 1; disk_link->name = NULL; return 0; } EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { int err; struct qstr iname = QSTR_INIT(target, len); struct fscrypt_symlink_data *sd; unsigned int ciphertext_len; /* * fscrypt_prepare_new_inode() should have already set up the new * symlink inode's encryption key. We don't wait until now to do it, * since we may be in a filesystem transaction now. */ if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode))) return -ENOKEY; if (disk_link->name) { /* filesystem-provided buffer */ sd = (struct fscrypt_symlink_data *)disk_link->name; } else { sd = kmalloc(disk_link->len, GFP_NOFS); if (!sd) return -ENOMEM; } ciphertext_len = disk_link->len - sizeof(*sd) - 1; sd->len = cpu_to_le16(ciphertext_len); err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len); if (err) goto err_free_sd; /* * Null-terminating the ciphertext doesn't make sense, but we still * count the null terminator in the length, so we might as well * initialize it just in case the filesystem writes it out. */ sd->encrypted_path[ciphertext_len] = '\0'; /* Cache the plaintext symlink target for later use by get_link() */ err = -ENOMEM; inode->i_link = kmemdup(target, len + 1, GFP_NOFS); if (!inode->i_link) goto err_free_sd; if (!disk_link->name) disk_link->name = (unsigned char *)sd; return 0; err_free_sd: if (!disk_link->name) kfree(sd); return err; } EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); /** * fscrypt_get_symlink() - get the target of an encrypted symlink * @inode: the symlink inode * @caddr: the on-disk contents of the symlink * @max_size: size of @caddr buffer * @done: if successful, will be set up to free the returned target if needed * * If the symlink's encryption key is available, we decrypt its target. * Otherwise, we encode its target for presentation. * * This may sleep, so the filesystem must have dropped out of RCU mode already. * * Return: the presentable symlink target or an ERR_PTR() */ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done) { const struct fscrypt_symlink_data *sd; struct fscrypt_str cstr, pstr; bool has_key; int err; /* This is for encrypted symlinks only */ if (WARN_ON_ONCE(!IS_ENCRYPTED(inode))) return ERR_PTR(-EINVAL); /* If the decrypted target is already cached, just return it. */ pstr.name = READ_ONCE(inode->i_link); if (pstr.name) return pstr.name; /* * Try to set up the symlink's encryption key, but we can continue * regardless of whether the key is available or not. */ err = fscrypt_get_encryption_info(inode, false); if (err) return ERR_PTR(err); has_key = fscrypt_has_encryption_key(inode); /* * For historical reasons, encrypted symlink targets are prefixed with * the ciphertext length, even though this is redundant with i_size. */ if (max_size < sizeof(*sd) + 1) return ERR_PTR(-EUCLEAN); sd = caddr; cstr.name = (unsigned char *)sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); if (cstr.len == 0) return ERR_PTR(-EUCLEAN); if (cstr.len + sizeof(*sd) > max_size) return ERR_PTR(-EUCLEAN); err = fscrypt_fname_alloc_buffer(cstr.len, &pstr); if (err) return ERR_PTR(err); err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (err) goto err_kfree; err = -EUCLEAN; if (pstr.name[0] == '\0') goto err_kfree; pstr.name[pstr.len] = '\0'; /* * Cache decrypted symlink targets in i_link for later use. Don't cache * symlink targets encoded without the key, since those become outdated * once the key is added. This pairs with the READ_ONCE() above and in * the VFS path lookup code. */ if (!has_key || cmpxchg_release(&inode->i_link, NULL, pstr.name) != NULL) set_delayed_call(done, kfree_link, pstr.name); return pstr.name; err_kfree: kfree(pstr.name); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(fscrypt_get_symlink); /** * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks * @path: the path for the encrypted symlink being queried * @stat: the struct being filled with the symlink's attributes * * Override st_size of encrypted symlinks to be the length of the decrypted * symlink target (or the no-key encoded symlink target, if the key is * unavailable) rather than the length of the encrypted symlink target. This is * necessary for st_size to match the symlink target that userspace actually * sees. POSIX requires this, and some userspace programs depend on it. * * This requires reading the symlink target from disk if needed, setting up the * inode's encryption key if possible, and then decrypting or encoding the * symlink target. This makes lstat() more heavyweight than is normally the * case. However, decrypted symlink targets will be cached in ->i_link, so * usually the symlink won't have to be read and decrypted again later if/when * it is actually followed, readlink() is called, or lstat() is called again. * * Return: 0 on success, -errno on failure */ int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) { struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); const char *link; DEFINE_DELAYED_CALL(done); /* * To get the symlink target that userspace will see (whether it's the * decrypted target or the no-key encoded target), we can just get it in * the same way the VFS does during path resolution and readlink(). */ link = READ_ONCE(inode->i_link); if (!link) { link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } stat->size = strlen(link); do_delayed_call(&done); return 0; } EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr);
3652 917 2469 14938 15895 16499 16502 15894 15896 5899 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMZONE_H #define _LINUX_MMZONE_H #ifndef __ASSEMBLY__ #ifndef __GENERATING_BOUNDS_H #include <linux/spinlock.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/cache.h> #include <linux/threads.h> #include <linux/numa.h> #include <linux/init.h> #include <linux/seqlock.h> #include <linux/nodemask.h> #include <linux/pageblock-flags.h> #include <linux/page-flags-layout.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/local_lock.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER #define MAX_ORDER 10 #else #define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif #define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should * coalesce naturally under reasonable reclaim pressure and those which * will not. */ #define PAGE_ALLOC_COSTLY_ORDER 3 enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way * ZONE_MOVABLE works. Only movable pages can be allocated * from MIGRATE_CMA pageblocks and page allocator never * implicitly change migration type of MIGRATE_CMA pageblock. * * The way to use it is to change migratetype of a range of * pageblocks to MIGRATE_CMA which can be done by * __free_pageblock_cma() function. */ MIGRATE_CMA, #endif #ifdef CONFIG_MEMORY_ISOLATION MIGRATE_ISOLATE, /* can't allocate from here */ #endif MIGRATE_TYPES }; /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false #endif static inline bool is_migrate_movable(int mt) { return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; } /* * Check whether a migratetype can be merged with another migratetype. * * It is only mergeable when it can fall back to other migratetypes for * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c. */ static inline bool migratetype_is_mergeable(int mt) { return mt < MIGRATE_PCPTYPES; } #define for_each_migratetype_order(order, type) \ for (order = 0; order <= MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; #define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1) #define get_pageblock_migratetype(page) \ get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK) #define folio_migratetype(folio) \ get_pfnblock_flags_mask(&folio->page, folio_pfn(folio), \ MIGRATETYPE_MASK) struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; }; struct pglist_data; #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ NUMA_FOREIGN, /* was intended here, hit elsewhere */ NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ NR_VM_NUMA_EVENT_ITEMS }; #else #define NR_VM_NUMA_EVENT_ITEMS 0 #endif enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif NR_FREE_CMA_PAGES, #ifdef CONFIG_UNACCEPTED_MEMORY NR_UNACCEPTED, #endif NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ NR_SLAB_RECLAIMABLE_B, NR_SLAB_UNRECLAIMABLE_B, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_FILE, WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_FILE, WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_FILE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, NR_FILE_DIRTY, NR_WRITEBACK, NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ NR_THROTTLED_WRITTEN, /* NR_WRITTEN while reclaim throttled */ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ NR_KERNEL_STACK_KB, /* measured in KiB */ #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. KVM pagetables */ #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ PGPROMOTE_CANDIDATE, /* candidate pages to promote */ #endif NR_VM_NODE_STAT_ITEMS }; /* * Returns true if the item should be printed in THPs (/proc/vmstat * currently prints number of anon, file and shmem THPs. But the item * is charged in pages). */ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return false; return item == NR_ANON_THPS || item == NR_FILE_THPS || item == NR_SHMEM_THPS || item == NR_SHMEM_PMDMAPPED || item == NR_FILE_PMDMAPPED; } /* * Returns true if the value is measured in bytes (most vmstat values are * measured in pages). This defines the API part, the internal representation * might be different. */ static __always_inline bool vmstat_item_in_bytes(int idx) { /* * Global and per-node slab counters track slab pages. * It's expected that changes are multiples of PAGE_SIZE. * Internally values are stored in pages. * * Per-memcg and per-lruvec counters track memory, consumed * by individual slab objects. These counters are actually * byte-precise. */ return (idx == NR_SLAB_RECLAIMABLE_B || idx == NR_SLAB_UNRECLAIMABLE_B); } /* * We do arithmetic on the LRU lists in various places in the code, * so it is important to keep the active lists LRU_ACTIVE higher in * the array than the corresponding inactive lists, and to keep * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. * * This has to be kept in sync with the statistics in zone_stat_item * above and the descriptions in vmstat_text in mm/vmstat.c */ #define LRU_BASE 0 #define LRU_ACTIVE 1 #define LRU_FILE 2 enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, LRU_UNEVICTABLE, NR_LRU_LISTS }; enum vmscan_throttle_state { VMSCAN_THROTTLE_WRITEBACK, VMSCAN_THROTTLE_ISOLATED, VMSCAN_THROTTLE_NOPROGRESS, VMSCAN_THROTTLE_CONGESTED, NR_VMSCAN_THROTTLE, }; #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) static inline bool is_file_lru(enum lru_list lru) { return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } #define WORKINGSET_ANON 0 #define WORKINGSET_FILE 1 #define ANON_AND_FILE 2 enum lruvec_flags { /* * An lruvec has many dirty pages backed by a congested BDI: * 1. LRUVEC_CGROUP_CONGESTED is set by cgroup-level reclaim. * It can be cleared by cgroup reclaim or kswapd. * 2. LRUVEC_NODE_CONGESTED is set by kswapd node-level reclaim. * It can only be cleared by kswapd. * * Essentially, kswapd can unthrottle an lruvec throttled by cgroup * reclaim, but not vice versa. This only applies to the root cgroup. * The goal is to prevent cgroup reclaim on the root cgroup (e.g. * memory.reclaim) to unthrottle an unbalanced node (that was throttled * by kswapd). */ LRUVEC_CGROUP_CONGESTED, LRUVEC_NODE_CONGESTED, }; #endif /* !__GENERATING_BOUNDS_H */ /* * Evictable pages are divided into multiple generations. The youngest and the * oldest generation numbers, max_seq and min_seq, are monotonically increasing. * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while * a page is on one of lrugen->folios[]. Otherwise it stores 0. * * A page is added to the youngest generation on faulting. The aging needs to * check the accessed bit at least twice before handing this page over to the * eviction. The first check takes care of the accessed bit set on the initial * fault; the second check makes sure this page hasn't been used since then. * This process, AKA second chance, requires a minimum of two generations, * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive * LRU, e.g., /proc/vmstat, these two generations are considered active; the * rest of generations, if they exist, are considered inactive. See * lru_gen_is_active(). * * PG_active is always cleared while a page is on one of lrugen->folios[] so * that the aging needs not to worry about it. And it's set again when a page * considered active is isolated for non-reclaiming purposes, e.g., migration. * See lru_gen_add_folio() and lru_gen_del_folio(). * * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits * in folio->flags. */ #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U /* * Each generation is divided into multiple tiers. A page accessed N times * through file descriptors is in tier order_base_2(N). A page in the first tier * (N=0,1) is marked by PG_referenced unless it was faulted in through page * tables or read ahead. A page in any other tier (N>1) is marked by * PG_referenced and PG_workingset. This implies a minimum of two tiers is * supported without using additional bits in folio->flags. * * In contrast to moving across generations which requires the LRU lock, moving * across tiers only involves atomic operations on folio->flags and therefore * has a negligible cost in the buffered access path. In the eviction path, * comparisons of refaulted/(evicted+protected) from the first tier and the * rest infer whether pages accessed multiple times through file descriptors * are statistically hot and thus worth protecting. * * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in * folio->flags. */ #define MAX_NR_TIERS 4U #ifndef __GENERATING_BOUNDS_H struct lruvec; struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) #ifdef CONFIG_LRU_GEN enum { LRU_GEN_ANON, LRU_GEN_FILE, }; enum { LRU_GEN_CORE, LRU_GEN_MM_WALK, LRU_GEN_NONLEAF_YOUNG, NR_LRU_GEN_CAPS }; #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) /* whether to keep historical stats from evicted generations */ #ifdef CONFIG_LRU_GEN_STATS #define NR_HIST_GENS MAX_NR_GENS #else #define NR_HIST_GENS 1U #endif /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. The oldest generation numbers are * stored in min_seq[] separately for anon and file types as clean file pages * can be evicted regardless of swap constraints. * * Normally anon and file min_seq are in sync. But if swapping is constrained, * e.g., out of swap space, file min_seq is allowed to advance and leave anon * min_seq behind. * * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. */ struct lru_gen_folio { /* the aging increments the youngest generation number */ unsigned long max_seq; /* the eviction increments the oldest generation numbers */ unsigned long min_seq[ANON_AND_FILE]; /* the birth time of each generation in jiffies */ unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the exponential moving average of refaulted */ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; /* the exponential moving average of evicted+protected */ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; /* the first tier doesn't need protection, hence the minus one */ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; #ifdef CONFIG_MEMCG /* the memcg generation this lru_gen_folio belongs to */ u8 gen; /* the list segment this lru_gen_folio belongs to */ u8 seg; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_node list; #endif }; enum { MM_LEAF_TOTAL, /* total leaf entries */ MM_LEAF_OLD, /* old leaf entries */ MM_LEAF_YOUNG, /* young leaf entries */ MM_NONLEAF_TOTAL, /* total non-leaf entries */ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ NR_MM_STATS }; /* double-buffering Bloom filters */ #define NR_BLOOM_FILTERS 2 struct lru_gen_mm_state { /* set to max_seq after each iteration */ unsigned long seq; /* where the current iteration continues after */ struct list_head *head; /* where the last iteration ended before */ struct list_head *tail; /* Bloom filters flip after each iteration */ unsigned long *filters[NR_BLOOM_FILTERS]; /* the mm stats for debugging */ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; }; struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; /* unstable max_seq from lru_gen_folio */ unsigned long max_seq; /* the next address within an mm to scan */ unsigned long next_addr; /* to batch promoted pages */ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* to batch the mm stats */ int mm_stats[NR_MM_STATS]; /* total batched items */ int batched; bool can_swap; bool force_scan; }; void lru_gen_init_lruvec(struct lruvec *lruvec); void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); #ifdef CONFIG_MEMCG /* * For each node, memcgs are divided into two generations: the old and the * young. For each generation, memcgs are randomly sharded into multiple bins * to improve scalability. For each bin, the hlist_nulls is virtually divided * into three segments: the head, the tail and the default. * * An onlining memcg is added to the tail of a random bin in the old generation. * The eviction starts at the head of a random bin in the old generation. The * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes * the old generation, is incremented when all its bins become empty. * * There are four operations: * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its * current generation (old or young) and updates its "seg" to "head"; * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its * current generation (old or young) and updates its "seg" to "tail"; * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old * generation, updates its "gen" to "old" and resets its "seg" to "default"; * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the * young generation, updates its "gen" to "young" and resets its "seg" to * "default". * * The events that trigger the above operations are: * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; * 2. The first attempt to reclaim an memcg below low, which triggers * MEMCG_LRU_TAIL; * 3. The first attempt to reclaim an memcg below reclaimable size threshold, * which triggers MEMCG_LRU_TAIL; * 4. The second attempt to reclaim an memcg below reclaimable size threshold, * which triggers MEMCG_LRU_YOUNG; * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. * * Note that memcg LRU only applies to global reclaim, and the round-robin * incrementing of their max_seq counters ensures the eventual fairness to all * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). */ #define MEMCG_NR_GENS 2 #define MEMCG_NR_BINS 8 struct lru_gen_memcg { /* the per-node memcg generation counter */ unsigned long seq; /* each memcg has one lru_gen_folio per node */ unsigned long nr_memcgs[MEMCG_NR_GENS]; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; /* protects the above */ spinlock_t lock; }; void lru_gen_init_pgdat(struct pglist_data *pgdat); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); #else /* !CONFIG_MEMCG */ #define MEMCG_NR_GENS 1 struct lru_gen_memcg { }; static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) { } #endif /* CONFIG_MEMCG */ #else /* !CONFIG_LRU_GEN */ static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) { } static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { } #ifdef CONFIG_MEMCG static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } #endif /* CONFIG_MEMCG */ #endif /* CONFIG_LRU_GEN */ struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* per lruvec lru_lock for memcg */ spinlock_t lru_lock; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU * increases, the reclaim scan balance tips toward the other. */ unsigned long anon_cost; unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_folio lrugen; /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif }; /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) /* Isolate unevictable pages */ #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8) /* LRU Isolation modes. */ typedef unsigned __bitwise isolate_mode_t; enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, WMARK_PROMO, NR_WMARK }; /* * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list * for THP which will usually be GFP_MOVABLE. Even if it is another type, * it should not contribute to serious fragmentation causing THP allocation * failures. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_PCP_THP 1 #else #define NR_PCP_THP 0 #endif #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) /* * Flags used in pcp->flags field. * * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the * previous page freeing. To avoid to drain PCP for an accident * high-order page freeing. * * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before * draining PCP for consecutive high-order pages freeing without * allocation if data cache slice of CPU is large enough. To reduce * zone lock contention and keep cache-hot pages reusing. */ #define PCPF_PREV_FREE_HIGH_ORDER BIT(0) #define PCPF_FREE_HIGH_BATCH BIT(1) struct per_cpu_pages { spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int high_min; /* min high watermark */ int high_max; /* max high watermark */ int batch; /* chunk size for buddy add/remove */ u8 flags; /* protected by pcp->lock */ u8 alloc_factor; /* batch scaling factor during allocate */ #ifdef CONFIG_NUMA u8 expire; /* When 0, remote pagesets are drained */ #endif short free_count; /* consecutive free count */ /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[NR_PCP_LISTS]; } ____cacheline_aligned_in_smp; struct per_cpu_zonestat { #ifdef CONFIG_SMP s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; s8 stat_threshold; #endif #ifdef CONFIG_NUMA /* * Low priority inaccurate counters that are only folded * on demand. Use a large type to avoid the overhead of * folding during refresh_cpu_vm_stats. */ unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #endif }; struct per_cpu_nodestat { s8 stat_threshold; s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; }; #endif /* !__GENERATING_BOUNDS.H */ enum zone_type { /* * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able * to DMA to all of the addressable memory (ZONE_NORMAL). * On architectures where this area covers the whole 32 bit address * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller * DMA addressing constraints. This distinction is important as a 32bit * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit * platforms may need both zones as they support peripherals with * different DMA addressing limitations. */ #ifdef CONFIG_ZONE_DMA ZONE_DMA, #endif #ifdef CONFIG_ZONE_DMA32 ZONE_DMA32, #endif /* * Normal addressable memory is in ZONE_NORMAL. DMA operations can be * performed on pages in ZONE_NORMAL if the DMA devices support * transfers to all addressable memory. */ ZONE_NORMAL, #ifdef CONFIG_HIGHMEM /* * A memory area that is only addressable by the kernel through * mapping portions into its own address space. This is for example * used by i386 to allow the kernel to address the memory beyond * 900MB. The kernel will set up special mappings (page * table entries on i386) for each page that the kernel needs to * access. */ ZONE_HIGHMEM, #endif /* * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains * movable pages with few exceptional cases described below. Main use * cases for ZONE_MOVABLE are to make memory offlining/unplug more * likely to succeed, and to locally limit unmovable allocations - e.g., * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might * essentially turn such pages unmovable. Therefore, we do not allow * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and * faulted, they come from the right zone right away. However, it is * still possible that address space already has pages in * ZONE_MOVABLE at the time when pages are pinned (i.e. user has * touches that memory before pinning). In such case we migrate them * to a different zone. When migration fails - pinning fails. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. * 3. Memory holes: kernelcore/movablecore setups might create very rare * situations where ZONE_MOVABLE contains memory holes after boot, * for example, if we have sections that are only partially * populated. Memory offlining and allocations fail early. * 4. PG_hwpoison pages: while poisoned pages can be skipped during * memory offlining, such pages cannot be allocated. * 5. Unmovable PG_offline pages: in paravirtualized environments, * hotplugged memory blocks might only partially be managed by the * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The * parts not manged by the buddy are unmovable PG_offline pages. In * some cases (virtio-mem), such pages can be skipped during * memory offlining, however, cannot be moved/allocated. These * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create * situations where ZERO_PAGE(0) which is allocated differently * on different platforms may end up in a movable zone. ZERO_PAGE(0) * cannot be migrated. * 7. Memory-hotplug: when using memmap_on_memory and onlining the * memory to the MOVABLE zone, the vmemmap pages are also placed in * such zone. Such pages cannot be really moved around as they are * self-stored in the range, but they are treated as movable when * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) * have to expect that migrating pages in ZONE_MOVABLE can fail (even * if has_unmovable_pages() states that there are no unmovable pages, * there can be false negatives). */ ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, #endif __MAX_NR_ZONES }; #ifndef __GENERATING_BOUNDS_H #define ASYNC_AND_SYNC 2 struct zone { /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long _watermark[NR_WMARK]; unsigned long watermark_boost; unsigned long nr_reserved_highatomic; /* * We don't know if the memory that we're going to allocate will be * freeable or/and it will be released eventually, so to avoid totally * wasting several GB of ram we must reserve some of the lower zone * memory (otherwise we risk to run OOM on the lower zones despite * there being tons of freeable ram on the higher zones). This array is * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl * changes. */ long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA int node; #endif struct pglist_data *zone_pgdat; struct per_cpu_pages __percpu *per_cpu_pageset; struct per_cpu_zonestat __percpu *per_cpu_zonestats; /* * the high and batch values are copied to individual pagesets for * faster access */ int pageset_high_min; int pageset_high_max; int pageset_batch; #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. * In SPARSEMEM, this map is stored in struct mem_section */ unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; /* * spanned_pages is the total pages spanned by the zone, including * holes, which is calculated as: * spanned_pages = zone_end_pfn - zone_start_pfn; * * present_pages is physical pages existing within the zone, which * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * * present_early_pages is present pages existing within the zone * located on memory available since early boot, excluding hotplugged * memory. * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): * managed_pages = present_pages - reserved_pages; * * cma pages is present pages that are assigned for CMA use * (MIGRATE_CMA). * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used * by page allocator and vm scanner to calculate all kinds of watermarks * and thresholds. * * Locking rules: * * zone_start_pfn and spanned_pages are protected by span_seqlock. * It is a seqlock because it has to be read outside of zone->lock, * and it is done in the main allocator path. But, it is written * quite infrequently. * * The span_seq lock is declared along with zone->lock because it is * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by * mem_hotplug_begin/done(). Any reader who can't tolerant drift of * present_pages should use get_online_mems() to get a stable value. */ atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; #if defined(CONFIG_MEMORY_HOTPLUG) unsigned long present_early_pages; #endif #ifdef CONFIG_CMA unsigned long cma_pages; #endif const char *name; #ifdef CONFIG_MEMORY_ISOLATION /* * Number of isolated pageblock. It is used to solve incorrect * freepage counting problem due to racy retrieving migratetype * of pageblock. Protected by zone->lock. */ unsigned long nr_isolate_pageblock; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; #endif int initialized; /* Write-intensive fields used from the page allocator */ CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ struct free_area free_area[MAX_ORDER + 1]; #ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_ORDER */ struct list_head unaccepted_pages; #endif /* zone flags, see below */ unsigned long flags; /* Primarily protects free_area */ spinlock_t lock; /* Write-intensive fields used by compaction and vmstats. */ CACHELINE_PADDING(_pad2_); /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter * drift allowing watermarks to be breached */ unsigned long percpu_drift_mark; #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; /* pfn where compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION /* * On compaction failure, 1<<compact_defer_shift compactions * are skipped before trying again. The number attempted since * last failure is tracked with compact_considered. * compact_order_failed is the minimum compaction failed order. */ unsigned int compact_considered; unsigned int compact_defer_shift; int compact_order_failed; #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; #endif bool contiguous; CACHELINE_PADDING(_pad3_); /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; } ____cacheline_internodealigned_in_smp; enum pgdat_flags { PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; enum zone_flags { ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. * Cleared when kswapd is woken. */ ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */ ZONE_BELOW_HIGH, /* zone is below high watermark. */ }; static inline unsigned long zone_managed_pages(struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } static inline unsigned long zone_cma_pages(struct zone *zone) { #ifdef CONFIG_CMA return zone->cma_pages; #else return 0; #endif } static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; } static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) { return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } static inline bool zone_is_initialized(struct zone *zone) { return zone->initialized; } static inline bool zone_is_empty(struct zone *zone) { return zone->spanned_pages == 0; } #ifndef BUILD_VDSO32_64 /* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. */ /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) #define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) #define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) /* * Define the bit shifts to access each section. For non-existent * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. */ #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) #define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) #define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS #define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF) ? \ SECTIONS_PGOFF : ZONES_PGOFF) #else #define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF) ? \ NODES_PGOFF : ZONES_PGOFF) #endif #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) #define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) { ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } static inline enum zone_type folio_zonenum(const struct folio *folio) { return page_zonenum(&folio->page); } #ifdef CONFIG_ZONE_DEVICE static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } /* * Consecutive zone device pages should not be merged into the same sgl * or bvec segment with other types of pages or if they belong to different * pgmaps. Otherwise getting the pgmap of a given segment is not possible * without scanning the entire segment. This helper returns true either if * both pages are not zone device pages or both pages are zone device pages * with the same pgmap. */ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { if (is_zone_device_page(a) != is_zone_device_page(b)) return false; if (!is_zone_device_page(a)) return true; return a->pgmap == b->pgmap; } extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else static inline bool is_zone_device_page(const struct page *page) { return false; } static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { return true; } #endif static inline bool folio_is_zone_device(const struct folio *folio) { return is_zone_device_page(&folio->page); } static inline bool is_zone_movable_page(const struct page *page) { return page_zonenum(page) == ZONE_MOVABLE; } static inline bool folio_is_zone_movable(const struct folio *folio) { return folio_zonenum(folio) == ZONE_MOVABLE; } #endif /* * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ static inline bool zone_intersects(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) return false; if (start_pfn >= zone_end_pfn(zone) || start_pfn + nr_pages <= zone->zone_start_pfn) return false; return true; } /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round. */ #define DEF_PRIORITY 12 /* Maximum number of zones on a zonelist */ #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) enum { ZONELIST_FALLBACK, /* zonelist with fallback */ #ifdef CONFIG_NUMA /* * The NUMA zonelists are doubled because we need zonelists that * restrict the allocations to a single node for __GFP_THISNODE. */ ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */ #endif MAX_ZONELISTS }; /* * This struct contains information about a zone in a zonelist. It is stored * here to avoid dereferences into large structures and lookups of tables */ struct zoneref { struct zone *zone; /* Pointer to actual zone */ int zone_idx; /* zone_idx(zoneref->zone) */ }; /* * One allocation request operates on a zonelist. A zonelist * is a list of zones, the first one is the 'goal' of the * allocation, the other zones are fallback zones, in decreasing * priority. * * To speed the reading of the zonelist, the zonerefs contain the zone index * of the entry being read. Helper functions to access information given * a struct zoneref are * * zonelist_zone() - Return the struct zone * for an entry in _zonerefs * zonelist_zone_idx() - Return the index of the zone for an entry * zonelist_node_idx() - Return the index of the node for an entry */ struct zonelist { struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; }; /* * The array of struct pages for flatmem. * It must be declared for SPARSEMEM as well because there are configurations * that rely on that. */ extern struct page *mem_map; #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split { spinlock_t split_queue_lock; struct list_head split_queue; unsigned long split_queue_len; }; #endif #ifdef CONFIG_MEMORY_FAILURE /* * Per NUMA node memory failure handling statistics. */ struct memory_failure_stats { /* * Number of raw pages poisoned. * Cases not accounted: memory outside kernel control, offline page, * arch-specific memory_failure (SGX), hwpoison_filter() filtered * error events, and unpoison actions from hwpoison_unpoison. */ unsigned long total; /* * Recovery results of poisoned raw pages handled by memory_failure, * in sync with mf_result. * total = ignored + failed + delayed + recovered. * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. */ unsigned long ignored; unsigned long failed; unsigned long delayed; unsigned long recovered; }; #endif /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which * describes the whole memory. * * Memory statistics and page replacement data structures are maintained on a * per-zone basis. */ typedef struct pglist_data { /* * node_zones contains just the zones for THIS node. Not all of the * zones may be populated, but it is the full list. It is referenced by * this node's node_zonelists as well as other node's node_zonelists. */ struct zone node_zones[MAX_NR_ZONES]; /* * node_zonelists contains references to all zones in all nodes. * Generally the first zones will be references to this node's * node_zones. */ struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; /* number of populated zones in this node */ #ifdef CONFIG_FLATMEM /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; #endif #endif #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. * Also synchronizes pgdat->first_deferred_pfn during deferred page * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG * or CONFIG_DEFERRED_STRUCT_PAGE_INIT. * * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; /* workqueues for throttling reclaim for different reasons. */ wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE]; atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ #ifdef CONFIG_MEMORY_HOTPLUG struct mutex kswapd_lock; #endif struct task_struct *kswapd; /* Protected by kswapd_lock */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; bool proactive_compact_trigger; #endif /* * This is a per-node reserve of pages that are not available * to userspace allocations. */ unsigned long totalreserve_pages; #ifdef CONFIG_NUMA /* * node reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ CACHELINE_PADDING(_pad1_); #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * If memory initialisation on large machines is deferred then this * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif #ifdef CONFIG_NUMA_BALANCING /* start time in ms of current promote rate limit period */ unsigned int nbp_rl_start; /* number of promote candidate pages at start time of current rate limit period */ unsigned long nbp_rl_nr_cand; /* promote threshold in ms */ unsigned int nbp_threshold; /* start time in ms of current promote threshold adjustment period */ unsigned int nbp_th_start; /* * number of promote candidate pages at start time of current promote * threshold adjustment period */ unsigned long nbp_th_nr_cand; #endif /* Fields commonly accessed by the page reclaim scanner */ /* * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. * * Use mem_cgroup_lruvec() to look up lruvecs. */ struct lruvec __lruvec; unsigned long flags; #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ struct lru_gen_mm_walk mm_walk; /* lru_gen_folio list */ struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; #ifdef CONFIG_NUMA struct memory_tier __rcu *memtier; #endif #ifdef CONFIG_MEMORY_FAILURE struct memory_failure_stats mf_stats; #endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; } #include <linux/memory_hotplug.h> void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, enum zone_type highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx); /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. */ enum meminit_context { MEMINIT_EARLY, MEMINIT_HOTPLUG, }; extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); extern void lruvec_init(struct lruvec *lruvec); static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) { #ifdef CONFIG_MEMCG return lruvec->pgdat; #else return container_of(lruvec, struct pglist_data, __lruvec); #endif } #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else static inline int local_memory_node(int node_id) { return node_id; }; #endif /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) #ifdef CONFIG_ZONE_DEVICE static inline bool zone_is_zone_device(struct zone *zone) { return zone_idx(zone) == ZONE_DEVICE; } #else static inline bool zone_is_zone_device(struct zone *zone) { return false; } #endif /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than * populated_zone(). If the whole zone is reserved then we can easily * end up with populated_zone() && !managed_zone(). */ static inline bool managed_zone(struct zone *zone) { return zone_managed_pages(zone); } /* Returns true if a zone has memory */ static inline bool populated_zone(struct zone *zone) { return zone->present_pages; } #ifdef CONFIG_NUMA static inline int zone_to_nid(struct zone *zone) { return zone->node; } static inline void zone_set_nid(struct zone *zone, int nid) { zone->node = nid; } #else static inline int zone_to_nid(struct zone *zone) { return 0; } static inline void zone_set_nid(struct zone *zone, int nid) {} #endif extern int movable_zone; static inline int is_highmem_idx(enum zone_type idx) { #ifdef CONFIG_HIGHMEM return (idx == ZONE_HIGHMEM || (idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM)); #else return 0; #endif } /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. * @zone: pointer to struct zone variable * Return: 1 for a highmem zone, 0 otherwise */ static inline int is_highmem(struct zone *zone) { return is_highmem_idx(zone_idx(zone)); } #ifdef CONFIG_ZONE_DMA bool has_managed_dma(void); #else static inline bool has_managed_dma(void) { return false; } #endif #ifndef CONFIG_NUMA extern struct pglist_data contig_page_data; static inline struct pglist_data *NODE_DATA(int nid) { return &contig_page_data; } #else /* CONFIG_NUMA */ #include <asm/mmzone.h> #endif /* !CONFIG_NUMA */ extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); extern struct zone *next_zone(struct zone *zone); /** * for_each_online_pgdat - helper macro to iterate over all online nodes * @pgdat: pointer to a pg_data_t variable */ #define for_each_online_pgdat(pgdat) \ for (pgdat = first_online_pgdat(); \ pgdat; \ pgdat = next_online_pgdat(pgdat)) /** * for_each_zone - helper macro to iterate over all memory zones * @zone: pointer to struct zone variable * * The user only needs to declare the zone variable, for_each_zone * fills it in. */ #define for_each_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) #define for_each_populated_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) \ if (!populated_zone(zone)) \ ; /* do nothing */ \ else static inline struct zone *zonelist_zone(struct zoneref *zoneref) { return zoneref->zone; } static inline int zonelist_zone_idx(struct zoneref *zoneref) { return zoneref->zone_idx; } static inline int zonelist_node_idx(struct zoneref *zoneref) { return zone_to_nid(zoneref->zone); } struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes); /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point * @z: The cursor used as a starting point for the search * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the * search. The zoneref returned is a cursor that represents the current zone * being examined. It should be advanced by one before calling * next_zones_zonelist again. * * Return: the next zone at or below highest_zoneidx within the allowed * nodemask using a cursor within a zonelist as a starting point */ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) return z; return __next_zones_zonelist(z, highest_zoneidx, nodes); } /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist * @zonelist: The zonelist to search for a suitable zone * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the first zone at or below a given zone index that is * within the allowed nodemask. The zoneref returned is a cursor that can be * used to iterate the zonelist with next_zones_zonelist by advancing it by * one before calling. * * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is * never NULL). This may happen either genuinely, or due to concurrent nodemask * update due to cpuset modification. * * Return: Zoneref pointer for the first suitable zone found */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, nodemask_t *nodes) { return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes); } /** * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask * @zone: The current zone in the iterator * @z: The current pointer within zonelist->_zonerefs being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * @nodemask: Nodemask allowed by the allocator * * This iterator iterates though all zones at or below a given zone index and * within a given nodemask */ #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ for (zone = z->zone; \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index * @zone: The current zone in the iterator * @z: The current pointer within zonelist->zones being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * * This iterator iterates though all zones at or below a given zone index. */ #define for_each_zone_zonelist(zone, z, zlist, highidx) \ for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) /* Whether the 'nodes' are all movable nodes */ static inline bool movable_only_nodes(nodemask_t *nodes) { struct zonelist *zonelist; struct zoneref *z; int nid; if (nodes_empty(*nodes)) return false; /* * We can chose arbitrary node from the nodemask to get a * zonelist as they are interlinked. We just need to find * at least one zone that can satisfy kernel allocations. */ nid = first_node(*nodes); zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes); return (!z->zone) ? true : false; } #ifdef CONFIG_SPARSEMEM #include <asm/sparsemem.h> #endif #ifdef CONFIG_FLATMEM #define pfn_to_nid(pfn) (0) #endif #ifdef CONFIG_SPARSEMEM /* * PA_SECTION_SHIFT physical address to/from section number * PFN_SECTION_SHIFT pfn to/from section number */ #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) #if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS #error Allocator MAX_ORDER exceeds SECTION_SIZE #endif static inline unsigned long pfn_to_section_nr(unsigned long pfn) { return pfn >> PFN_SECTION_SHIFT; } static inline unsigned long section_nr_to_pfn(unsigned long sec) { return sec << PFN_SECTION_SHIFT; } #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) #define SUBSECTION_SHIFT 21 #define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT) #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT) #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT) #define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1)) #if SUBSECTION_SHIFT > SECTION_SIZE_BITS #error Subsection size exceeds section size #else #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT)) #endif #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { #ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); #endif /* See declaration of similar field in struct zone */ unsigned long pageblock_flags[0]; }; void subsection_map_init(unsigned long pfn, unsigned long nr_pages); struct page; struct page_ext; struct mem_section { /* * This is, logically, a pointer to an array of struct * pages. However, it is stored with some other magic. * (see sparse.c::sparse_init_one_section()) * * Additionally during early boot we encode node id of * the location of the section here to guide allocation. * (see sparse.c::memory_present()) * * Making it a UL at least makes someone do a cast * before using it wrong. */ unsigned long section_mem_map; struct mem_section_usage *usage; #ifdef CONFIG_PAGE_EXTENSION /* * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use * section. (see page_ext.h about this.) */ struct page_ext *page_ext; unsigned long pad; #endif /* * WARNING: mem_section must be a power-of-2 in size for the * calculation and use of SECTION_ROOT_MASK to make sense. */ }; #ifdef CONFIG_SPARSEMEM_EXTREME #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) #else #define SECTIONS_PER_ROOT 1 #endif #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) #ifdef CONFIG_SPARSEMEM_EXTREME extern struct mem_section **mem_section; #else extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif static inline unsigned long *section_to_usemap(struct mem_section *ms) { return ms->usage->pageblock_flags; } static inline struct mem_section *__nr_to_section(unsigned long nr) { unsigned long root = SECTION_NR_TO_ROOT(nr); if (unlikely(root >= NR_SECTION_ROOTS)) return NULL; #ifdef CONFIG_SPARSEMEM_EXTREME if (!mem_section || !mem_section[root]) return NULL; #endif return &mem_section[root][nr & SECTION_ROOT_MASK]; } extern size_t mem_section_usage_size(void); /* * We use the lower bits of the mem_map pointer to store * a little bit of information. The pointer is calculated * as mem_map - section_nr_to_pfn(pnum). The result is * aligned to the minimum alignment of the two values: * 1. All mem_map arrays are page-aligned. * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT * lowest bits. PFN_SECTION_SHIFT is arch-specific * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the * worst combination is powerpc with 256k pages, * which results in PFN_SECTION_SHIFT equal 6. * To sum it up, at least 6 bits are available on all architectures. * However, we can exceed 6 bits on some other architectures except * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available * with the worst case of 64K pages on arm64) if we make sure the * exceeded bit is not applicable to powerpc. */ enum { SECTION_MARKED_PRESENT_BIT, SECTION_HAS_MEM_MAP_BIT, SECTION_IS_ONLINE_BIT, SECTION_IS_EARLY_BIT, #ifdef CONFIG_ZONE_DEVICE SECTION_TAINT_ZONE_DEVICE_BIT, #endif SECTION_MAP_LAST_BIT, }; #define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT) #define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT) #define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT) #define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT) #ifdef CONFIG_ZONE_DEVICE #define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) #endif #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT static inline struct page *__section_mem_map_addr(struct mem_section *section) { unsigned long map = section->section_mem_map; map &= SECTION_MAP_MASK; return (struct page *)map; } static inline int present_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } static inline int present_section_nr(unsigned long nr) { return present_section(__nr_to_section(nr)); } static inline int valid_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } static inline int early_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_EARLY)); } static inline int valid_section_nr(unsigned long nr) { return valid_section(__nr_to_section(nr)); } static inline int online_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } #ifdef CONFIG_ZONE_DEVICE static inline int online_device_section(struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } #else static inline int online_device_section(struct mem_section *section) { return 0; } #endif static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); } #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); } extern unsigned long __highest_present_section_nr; static inline int subsection_map_index(unsigned long pfn) { return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION; } #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); return test_bit(idx, ms->usage->subsection_map); } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { return 1; } #endif #ifndef CONFIG_HAVE_ARCH_PFN_VALID /** * pfn_valid - check if there is a valid memory map entry for a PFN * @pfn: the page frame number to check * * Check if there is a valid memory map entry aka struct page for the @pfn. * Note, that availability of the memory map entry does not imply that * there is actual usable memory at that @pfn. The struct page may * represent a hole or an unusable page frame. * * Return: 1 for PFNs that have memory map entries and 0 otherwise */ static inline int pfn_valid(unsigned long pfn) { struct mem_section *ms; /* * Ensure the upper PAGE_SHIFT bits are clear in the * pfn. Else it might lead to false positives when * some of the upper bits are set, but the lower bits * match a valid pfn. */ if (PHYS_PFN(PFN_PHYS(pfn)) != pfn) return 0; if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __pfn_to_section(pfn); if (!valid_section(ms)) return 0; /* * Traditionally early sections always returned pfn_valid() for * the entire section-sized span. */ return early_section(ms) || pfn_section_valid(ms, pfn); } #endif static inline int pfn_in_present_section(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; return present_section(__pfn_to_section(pfn)); } static inline unsigned long next_present_section_nr(unsigned long section_nr) { while (++section_nr <= __highest_present_section_nr) { if (present_section_nr(section_nr)) return section_nr; } return -1; } /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate * this restriction. */ #ifdef CONFIG_NUMA #define pfn_to_nid(pfn) \ ({ \ unsigned long __pfn_to_nid_pfn = (pfn); \ page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ }) #else #define pfn_to_nid(pfn) (0) #endif void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) #define pfn_in_present_section pfn_valid #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */
183 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. */ #ifndef IB_ADDR_H #define IB_ADDR_H #include <linux/ethtool.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/socket.h> #include <linux/if_vlan.h> #include <net/ipv6.h> #include <net/if_inet6.h> #include <net/ip.h> #include <rdma/ib_verbs.h> #include <rdma/ib_pack.h> #include <net/net_namespace.h> /** * struct rdma_dev_addr - Contains resolved RDMA hardware addresses * @src_dev_addr: Source MAC address. * @dst_dev_addr: Destination MAC address. * @broadcast: Broadcast address of the device. * @dev_type: The interface hardware type of the device. * @bound_dev_if: An optional device interface index. * @transport: The transport type used. * @net: Network namespace containing the bound_dev_if net_dev. * @sgid_attr: GID attribute to use for identified SGID */ struct rdma_dev_addr { unsigned char src_dev_addr[MAX_ADDR_LEN]; unsigned char dst_dev_addr[MAX_ADDR_LEN]; unsigned char broadcast[MAX_ADDR_LEN]; unsigned short dev_type; int bound_dev_if; enum rdma_transport_type transport; struct net *net; const struct ib_gid_attr *sgid_attr; enum rdma_network_type network; int hoplimit; }; /** * rdma_translate_ip - Translate a local IP address to an RDMA hardware * address. * * The dev_addr->net field must be initialized. */ int rdma_translate_ip(const struct sockaddr *addr, struct rdma_dev_addr *dev_addr); /** * rdma_resolve_ip - Resolve source and destination IP addresses to * RDMA hardware addresses. * @src_addr: An optional source address to use in the resolution. If a * source address is not provided, a usable address will be returned via * the callback. * @dst_addr: The destination address to resolve. * @addr: A reference to a data location that will receive the resolved * addresses. The data location must remain valid until the callback has * been invoked. The net field of the addr struct must be valid. * @timeout_ms: Amount of time to wait for the address resolution to complete. * @callback: Call invoked once address resolution has completed, timed out, * or been canceled. A status of 0 indicates success. * @resolve_by_gid_attr: Resolve the ip based on the GID attribute from * rdma_dev_addr. * @context: User-specified context associated with the call. */ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), bool resolve_by_gid_attr, void *context); void rdma_addr_cancel(struct rdma_dev_addr *addr); int rdma_addr_size(const struct sockaddr *addr); int rdma_addr_size_in6(struct sockaddr_in6 *addr); int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr); static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) { return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9]; } static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey) { dev_addr->broadcast[8] = pkey >> 8; dev_addr->broadcast[9] = (unsigned char) pkey; } static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->broadcast + 4, sizeof *gid); } static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) { return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0; } static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) { return is_vlan_dev(dev) ? vlan_dev_vlan_id(dev) : 0xffff; } static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid) { switch (addr->sa_family) { case AF_INET: ipv6_addr_set_v4mapped(((struct sockaddr_in *) addr)->sin_addr.s_addr, (struct in6_addr *)gid); break; case AF_INET6: *(struct in6_addr *)&gid->raw = ((struct sockaddr_in6 *)addr)->sin6_addr; break; default: return -EINVAL; } return 0; } /* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */ static inline void rdma_gid2ip(struct sockaddr *out, const union ib_gid *gid) { if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { struct sockaddr_in *out_in = (struct sockaddr_in *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin_family = AF_INET; memcpy(&out_in->sin_addr.s_addr, gid->raw + 12, 4); } else { struct sockaddr_in6 *out_in = (struct sockaddr_in6 *)out; memset(out_in, 0, sizeof(*out_in)); out_in->sin6_family = AF_INET6; memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16); } } /* * rdma_get/set_sgid/dgid() APIs are applicable to IB, and iWarp. * They are not applicable to RoCE. * RoCE GIDs are derived from the IP addresses. */ static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(*gid)); } static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid); } static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid); } static inline enum ib_mtu iboe_get_mtu(int mtu) { /* * Reduce IB headers from effective IBoE MTU. */ mtu = mtu - (IB_GRH_BYTES + IB_UDP_BYTES + IB_BTH_BYTES + IB_EXT_XRC_BYTES + IB_EXT_ATOMICETH_BYTES + IB_ICRC_BYTES); if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096)) return IB_MTU_4096; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048)) return IB_MTU_2048; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024)) return IB_MTU_1024; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512)) return IB_MTU_512; else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256)) return IB_MTU_256; else return 0; } static inline int rdma_link_local_addr(struct in6_addr *addr) { if (addr->s6_addr32[0] == htonl(0xfe800000) && addr->s6_addr32[1] == 0) return 1; return 0; } static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) { memcpy(mac, &addr->s6_addr[8], 3); memcpy(mac + 3, &addr->s6_addr[13], 3); mac[0] ^= 2; } static inline int rdma_is_multicast_addr(struct in6_addr *addr) { __be32 ipv4_addr; if (addr->s6_addr[0] == 0xff) return 1; ipv4_addr = addr->s6_addr32[3]; return (ipv6_addr_v4mapped(addr) && ipv4_is_multicast(ipv4_addr)); } static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) { int i; mac[0] = 0x33; mac[1] = 0x33; for (i = 2; i < 6; ++i) mac[i] = addr->s6_addr[i + 10]; } static inline u16 rdma_get_vlan_id(union ib_gid *dgid) { u16 vid; vid = dgid->raw[11] << 8 | dgid->raw[12]; return vid < 0x1000 ? vid : 0xffff; } static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) { return is_vlan_dev(dev) ? vlan_dev_real_dev(dev) : NULL; } #endif /* IB_ADDR_H */
66 410 410 6809 6809 6688 31 6809 6810 31 6810 410 65 6690 5273 6690 644 6692 6677 4056 6691 372 371 371 368 371 371 371 371 352 6688 6201 6692 1835 1736 6691 6689 6689 6676 6200 8 8 187 187 1 6806 6807 186 6688 6691 23 562 563 563 6615 304 6617 2342 6614 5937 5822 4142 4930 3380 5940 6201 4035 4027 1969 311 305 3638 1335 3609 6037 5236 878 3933 5025 5046 372 349 287 251 372 372 371 6468 5306 6465 5760 6465 6113 354 628 6354 6692 4383 6470 6690 6465 659 6467 5797 5769 157 437 628 326 240 355 259 152 628 347 6691 6690 633 378 6691 349 348 394 410 131 392 409 409 410 59 53 59 11 60 62 62 187 6689 6689 6689 6691 5578 5045 4563 5578 6692 6690 6691 6690 6692 5045 6690 5370 6690 6691 409 410 8 8 6 6 65 65 410 410 409 6690 5741 6690 5511 409 6691 6692 6697 6691 6690 25 6695 6665 6665 6692 6692 6688 6690 25 6690 6689 8 8 6666 6664 65 65 65 6664 6689 6691 6692 6213 2692 454 408 126 410 1 1 408 503 503 502 76 503 449 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 // SPDX-License-Identifier: GPL-2.0-only /* * mm/percpu.c - percpu memory allocator * * Copyright (C) 2009 SUSE Linux Products GmbH * Copyright (C) 2009 Tejun Heo <tj@kernel.org> * * Copyright (C) 2017 Facebook Inc. * Copyright (C) 2017 Dennis Zhou <dennis@kernel.org> * * The percpu allocator handles both static and dynamic areas. Percpu * areas are allocated in chunks which are divided into units. There is * a 1-to-1 mapping for units to possible cpus. These units are grouped * based on NUMA properties of the machine. * * c0 c1 c2 * ------------------- ------------------- ------------ * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u * ------------------- ...... ------------------- .... ------------ * * Allocation is done by offsets into a unit's address space. Ie., an * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0, * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear * and even sparse. Access is handled by configuring percpu base * registers according to the cpu to unit mappings and offsetting the * base address using pcpu_unit_size. * * There is special consideration for the first chunk which must handle * the static percpu variables in the kernel image as allocation services * are not online yet. In short, the first chunk is structured like so: * * <Static | [Reserved] | Dynamic> * * The static data is copied from the original section managed by the * linker. The reserved section, if non-zero, primarily manages static * percpu variables from kernel modules. Finally, the dynamic section * takes care of normal allocations. * * The allocator organizes chunks into lists according to free size and * memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT * flag should be passed. All memcg-aware allocations are sharing one set * of chunks and all unaccounted allocations and allocations performed * by processes belonging to the root memory cgroup are using the second set. * * The allocator tries to allocate from the fullest chunk first. Each chunk * is managed by a bitmap with metadata blocks. The allocation map is updated * on every allocation and free to reflect the current state while the boundary * map is only updated on allocation. Each metadata block contains * information to help mitigate the need to iterate over large portions * of the bitmap. The reverse mapping from page to chunk is stored in * the page's index. Lastly, units are lazily backed and grow in unison. * * There is a unique conversion that goes on here between bytes and bits. * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk * tracks the number of pages it is responsible for in nr_pages. Helper * functions are used to convert from between the bytes, bits, and blocks. * All hints are managed in bits unless explicitly stated. * * To use this allocator, arch code should do the following: * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be * different from the default * * - use pcpu_setup_first_chunk() during percpu area initialization to * setup the first chunk containing the kernel static percpu area */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bitmap.h> #include <linux/cpumask.h> #include <linux/memblock.h> #include <linux/err.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/pfn.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> #include <linux/kmemleak.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/memcontrol.h> #include <asm/cacheflush.h> #include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/io.h> #define CREATE_TRACE_POINTS #include <trace/events/percpu.h> #include "percpu-internal.h" /* * The slots are sorted by the size of the biggest continuous free area. * 1-31 bytes share the same slot. */ #define PCPU_SLOT_BASE_SHIFT 5 /* chunks in slots below this are subject to being sidelined on failed alloc */ #define PCPU_SLOT_FAIL_THRESHOLD 3 #define PCPU_EMPTY_POP_PAGES_LOW 2 #define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ #ifndef __addr_to_pcpu_ptr #define __addr_to_pcpu_ptr(addr) \ (void __percpu *)((unsigned long)(addr) - \ (unsigned long)pcpu_base_addr + \ (unsigned long)__per_cpu_start) #endif #ifndef __pcpu_ptr_to_addr #define __pcpu_ptr_to_addr(ptr) \ (void __force *)((unsigned long)(ptr) + \ (unsigned long)pcpu_base_addr - \ (unsigned long)__per_cpu_start) #endif #else /* CONFIG_SMP */ /* on UP, it's always identity mapped */ #define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr) #define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) #endif /* CONFIG_SMP */ static int pcpu_unit_pages __ro_after_init; static int pcpu_unit_size __ro_after_init; static int pcpu_nr_units __ro_after_init; static int pcpu_atom_size __ro_after_init; int pcpu_nr_slots __ro_after_init; static int pcpu_free_slot __ro_after_init; int pcpu_sidelined_slot __ro_after_init; int pcpu_to_depopulate_slot __ro_after_init; static size_t pcpu_chunk_struct_size __ro_after_init; /* cpus with the lowest and highest unit addresses */ static unsigned int pcpu_low_unit_cpu __ro_after_init; static unsigned int pcpu_high_unit_cpu __ro_after_init; /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __ro_after_init; static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */ const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */ /* group information, used for vm allocation */ static int pcpu_nr_groups __ro_after_init; static const unsigned long *pcpu_group_offsets __ro_after_init; static const size_t *pcpu_group_sizes __ro_after_init; /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different * ways and thus often doesn't live in the vmalloc area. */ struct pcpu_chunk *pcpu_first_chunk __ro_after_init; /* * Optional reserved chunk. This chunk reserves part of the first * chunk and serves it for reserved allocations. When the reserved * region doesn't exist, the following variable is NULL. */ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */ /* * The number of empty populated pages, protected by pcpu_lock. * The reserved chunk doesn't contribute to the count. */ int pcpu_nr_empty_pop_pages; /* * The number of populated pages in use by the allocator, protected by * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets * allocated/deallocated, it is allocated/deallocated in all units of a chunk * and increments/decrements this count by 1). */ static unsigned long pcpu_nr_populated; /* * Balance work is used to populate or destroy chunks asynchronously. We * try to keep the number of populated free pages between * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one * empty chunk. */ static void pcpu_balance_workfn(struct work_struct *work); static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); static bool pcpu_async_enabled __read_mostly; static bool pcpu_atomic_alloc_failed; static void pcpu_schedule_balance_work(void) { if (pcpu_async_enabled) schedule_work(&pcpu_balance_work); } /** * pcpu_addr_in_chunk - check if the address is served from this chunk * @chunk: chunk of interest * @addr: percpu address * * RETURNS: * True if the address is served from this chunk. */ static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr) { void *start_addr, *end_addr; if (!chunk) return false; start_addr = chunk->base_addr + chunk->start_offset; end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE - chunk->end_offset; return addr >= start_addr && addr < end_addr; } static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } static int pcpu_size_to_slot(int size) { if (size == pcpu_unit_size) return pcpu_free_slot; return __pcpu_size_to_slot(size); } static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { const struct pcpu_block_md *chunk_md = &chunk->chunk_md; if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk_md->contig_hint == 0) return 0; return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); } /* set the pointer to a chunk in a page struct */ static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) { page->index = (unsigned long)pcpu; } /* obtain pointer to a chunk from a page struct */ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) { return (struct pcpu_chunk *)page->index; } static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) { return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx) { return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT); } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { return (unsigned long)chunk->base_addr + pcpu_unit_page_offset(cpu, page_idx); } /* * The following are helper functions to help access bitmaps and convert * between bitmap offsets to address offsets. */ static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index) { return chunk->alloc_map + (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG); } static unsigned long pcpu_off_to_block_index(int off) { return off / PCPU_BITMAP_BLOCK_BITS; } static unsigned long pcpu_off_to_block_off(int off) { return off & (PCPU_BITMAP_BLOCK_BITS - 1); } static unsigned long pcpu_block_off_to_off(int index, int off) { return index * PCPU_BITMAP_BLOCK_BITS + off; } /** * pcpu_check_block_hint - check against the contig hint * @block: block of interest * @bits: size of allocation * @align: alignment of area (max PAGE_SIZE) * * Check to see if the allocation can fit in the block's contig hint. * Note, a chunk uses the same hints as a block so this can also check against * the chunk's contig hint. */ static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits, size_t align) { int bit_off = ALIGN(block->contig_hint_start, align) - block->contig_hint_start; return bit_off + bits <= block->contig_hint; } /* * pcpu_next_hint - determine which hint to use * @block: block of interest * @alloc_bits: size of allocation * * This determines if we should scan based on the scan_hint or first_free. * In general, we want to scan from first_free to fulfill allocations by * first fit. However, if we know a scan_hint at position scan_hint_start * cannot fulfill an allocation, we can begin scanning from there knowing * the contig_hint will be our fallback. */ static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits) { /* * The three conditions below determine if we can skip past the * scan_hint. First, does the scan hint exist. Second, is the * contig_hint after the scan_hint (possibly not true iff * contig_hint == scan_hint). Third, is the allocation request * larger than the scan_hint. */ if (block->scan_hint && block->contig_hint_start > block->scan_hint_start && alloc_bits > block->scan_hint) return block->scan_hint_start + block->scan_hint; return block->first_free; } /** * pcpu_next_md_free_region - finds the next hint free area * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of free area * * Helper function for pcpu_for_each_md_free_region. It checks * block->contig_hint and performs aggregation across blocks to find the * next hint. It modifies bit_off and bits in-place to be consumed in the * loop. */ static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, int *bits) { int i = pcpu_off_to_block_index(*bit_off); int block_off = pcpu_off_to_block_off(*bit_off); struct pcpu_block_md *block; *bits = 0; for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); block++, i++) { /* handles contig area across blocks */ if (*bits) { *bits += block->left_free; if (block->left_free == PCPU_BITMAP_BLOCK_BITS) continue; return; } /* * This checks three things. First is there a contig_hint to * check. Second, have we checked this hint before by * comparing the block_off. Third, is this the same as the * right contig hint. In the last case, it spills over into * the next block and should be handled by the contig area * across blocks code. */ *bits = block->contig_hint; if (*bits && block->contig_hint_start >= block_off && *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) { *bit_off = pcpu_block_off_to_off(i, block->contig_hint_start); return; } /* reset to satisfy the second predicate above */ block_off = 0; *bits = block->right_free; *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free; } } /** * pcpu_next_fit_region - finds fit areas for a given allocation request * @chunk: chunk of interest * @alloc_bits: size of allocation * @align: alignment of area (max PAGE_SIZE) * @bit_off: chunk offset * @bits: size of free area * * Finds the next free region that is viable for use with a given size and * alignment. This only returns if there is a valid area to be used for this * allocation. block->first_free is returned if the allocation request fits * within the block to see if the request can be fulfilled prior to the contig * hint. */ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, int align, int *bit_off, int *bits) { int i = pcpu_off_to_block_index(*bit_off); int block_off = pcpu_off_to_block_off(*bit_off); struct pcpu_block_md *block; *bits = 0; for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); block++, i++) { /* handles contig area across blocks */ if (*bits) { *bits += block->left_free; if (*bits >= alloc_bits) return; if (block->left_free == PCPU_BITMAP_BLOCK_BITS) continue; } /* check block->contig_hint */ *bits = ALIGN(block->contig_hint_start, align) - block->contig_hint_start; /* * This uses the block offset to determine if this has been * checked in the prior iteration. */ if (block->contig_hint && block->contig_hint_start >= block_off && block->contig_hint >= *bits + alloc_bits) { int start = pcpu_next_hint(block, alloc_bits); *bits += alloc_bits + block->contig_hint_start - start; *bit_off = pcpu_block_off_to_off(i, start); return; } /* reset to satisfy the second predicate above */ block_off = 0; *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free, align); *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off; *bit_off = pcpu_block_off_to_off(i, *bit_off); if (*bits >= alloc_bits) return; } /* no valid offsets were found - fail condition */ *bit_off = pcpu_chunk_map_bits(chunk); } /* * Metadata free area iterators. These perform aggregation of free areas * based on the metadata blocks and return the offset @bit_off and size in * bits of the free area @bits. pcpu_for_each_fit_region only returns when * a fit is found for the allocation request. */ #define pcpu_for_each_md_free_region(chunk, bit_off, bits) \ for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \ (bit_off) < pcpu_chunk_map_bits((chunk)); \ (bit_off) += (bits) + 1, \ pcpu_next_md_free_region((chunk), &(bit_off), &(bits))) #define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \ for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ &(bits)); \ (bit_off) < pcpu_chunk_map_bits((chunk)); \ (bit_off) += (bits), \ pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ &(bits))) /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate * @gfp: allocation flags * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, * kzalloc() is used; otherwise, the equivalent of vzalloc() is used. * This is to facilitate passing through whitelisted flags. The * returned memory is always zeroed. * * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; if (size <= PAGE_SIZE) return kzalloc(size, gfp); else return __vmalloc(size, gfp | __GFP_ZERO); } /** * pcpu_mem_free - free memory * @ptr: memory to free * * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). */ static void pcpu_mem_free(void *ptr) { kvfree(ptr); } static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot, bool move_front) { if (chunk != pcpu_reserved_chunk) { if (move_front) list_move(&chunk->list, &pcpu_chunk_lists[slot]); else list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]); } } static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot) { __pcpu_chunk_move(chunk, slot, true); } /** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on * * This function is called after an allocation or free changed @chunk. * New slot according to the changed state is determined and @chunk is * moved to the slot. Note that the reserved chunk is never put on * chunk slots. * * CONTEXT: * pcpu_lock. */ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); /* leave isolated chunks in-place */ if (chunk->isolated) return; if (oslot != nslot) __pcpu_chunk_move(chunk, nslot, oslot < nslot); } static void pcpu_isolate_chunk(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); if (!chunk->isolated) { chunk->isolated = true; pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages; } list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]); } static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); if (chunk->isolated) { chunk->isolated = false; pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages; pcpu_chunk_relocate(chunk, -1); } } /* * pcpu_update_empty_pages - update empty page counters * @chunk: chunk of interest * @nr: nr of empty pages * * This is used to keep track of the empty pages now based on the premise * a md_block covers a page. The hint update functions recognize if a block * is made full or broken to calculate deltas for keeping track of free pages. */ static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr) { chunk->nr_empty_pop_pages += nr; if (chunk != pcpu_reserved_chunk && !chunk->isolated) pcpu_nr_empty_pop_pages += nr; } /* * pcpu_region_overlap - determines if two regions overlap * @a: start of first region, inclusive * @b: end of first region, exclusive * @x: start of second region, inclusive * @y: end of second region, exclusive * * This is used to determine if the hint region [a, b) overlaps with the * allocated region [x, y). */ static inline bool pcpu_region_overlap(int a, int b, int x, int y) { return (a < y) && (x < b); } /** * pcpu_block_update - updates a block given a free area * @block: block of interest * @start: start offset in block * @end: end offset in block * * Updates a block given a known free area. The region [start, end) is * expected to be the entirety of the free area within a block. Chooses * the best starting offset if the contig hints are equal. */ static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) { int contig = end - start; block->first_free = min(block->first_free, start); if (start == 0) block->left_free = contig; if (end == block->nr_bits) block->right_free = contig; if (contig > block->contig_hint) { /* promote the old contig_hint to be the new scan_hint */ if (start > block->contig_hint_start) { if (block->contig_hint > block->scan_hint) { block->scan_hint_start = block->contig_hint_start; block->scan_hint = block->contig_hint; } else if (start < block->scan_hint_start) { /* * The old contig_hint == scan_hint. But, the * new contig is larger so hold the invariant * scan_hint_start < contig_hint_start. */ block->scan_hint = 0; } } else { block->scan_hint = 0; } block->contig_hint_start = start; block->contig_hint = contig; } else if (contig == block->contig_hint) { if (block->contig_hint_start && (!start || __ffs(start) > __ffs(block->contig_hint_start))) { /* start has a better alignment so use it */ block->contig_hint_start = start; if (start < block->scan_hint_start && block->contig_hint > block->scan_hint) block->scan_hint = 0; } else if (start > block->scan_hint_start || block->contig_hint > block->scan_hint) { /* * Knowing contig == contig_hint, update the scan_hint * if it is farther than or larger than the current * scan_hint. */ block->scan_hint_start = start; block->scan_hint = contig; } } else { /* * The region is smaller than the contig_hint. So only update * the scan_hint if it is larger than or equal and farther than * the current scan_hint. */ if ((start < block->contig_hint_start && (contig > block->scan_hint || (contig == block->scan_hint && start > block->scan_hint_start)))) { block->scan_hint_start = start; block->scan_hint = contig; } } } /* * pcpu_block_update_scan - update a block given a free area from a scan * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of free area * * Finding the final allocation spot first goes through pcpu_find_block_fit() * to find a block that can hold the allocation and then pcpu_alloc_area() * where a scan is used. When allocations require specific alignments, * we can inadvertently create holes which will not be seen in the alloc * or free paths. * * This takes a given free area hole and updates a block as it may change the * scan_hint. We need to scan backwards to ensure we don't miss free bits * from alignment. */ static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off, int bits) { int s_off = pcpu_off_to_block_off(bit_off); int e_off = s_off + bits; int s_index, l_bit; struct pcpu_block_md *block; if (e_off > PCPU_BITMAP_BLOCK_BITS) return; s_index = pcpu_off_to_block_index(bit_off); block = chunk->md_blocks + s_index; /* scan backwards in case of alignment skipping free bits */ l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off); s_off = (s_off == l_bit) ? 0 : l_bit + 1; pcpu_block_update(block, s_off, e_off); } /** * pcpu_chunk_refresh_hint - updates metadata about a chunk * @chunk: chunk of interest * @full_scan: if we should scan from the beginning * * Iterates over the metadata blocks to find the largest contig area. * A full scan can be avoided on the allocation path as this is triggered * if we broke the contig_hint. In doing so, the scan_hint will be before * the contig_hint or after if the scan_hint == contig_hint. This cannot * be prevented on freeing as we want to find the largest area possibly * spanning blocks. */ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits; /* promote scan_hint to contig_hint */ if (!full_scan && chunk_md->scan_hint) { bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint; chunk_md->contig_hint_start = chunk_md->scan_hint_start; chunk_md->contig_hint = chunk_md->scan_hint; chunk_md->scan_hint = 0; } else { bit_off = chunk_md->first_free; chunk_md->contig_hint = 0; } bits = 0; pcpu_for_each_md_free_region(chunk, bit_off, bits) pcpu_block_update(chunk_md, bit_off, bit_off + bits); } /** * pcpu_block_refresh_hint * @chunk: chunk of interest * @index: index of the metadata block * * Scans over the block beginning at first_free and updates the block * metadata accordingly. */ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); unsigned int start, end; /* region start, region end */ /* promote scan_hint to contig_hint */ if (block->scan_hint) { start = block->scan_hint_start + block->scan_hint; block->contig_hint_start = block->scan_hint_start; block->contig_hint = block->scan_hint; block->scan_hint = 0; } else { start = block->first_free; block->contig_hint = 0; } block->right_free = 0; /* iterate over free areas and update the contig hints */ for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS) pcpu_block_update(block, start, end); } /** * pcpu_block_update_hint_alloc - update hint on allocation path * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request * * Updates metadata for the allocation path. The metadata only has to be * refreshed by a full scan iff the chunk's contig hint is broken. Block level * scans are required if the block's contig hint is broken. */ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, int bits) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ /* * Calculate per block offsets. * The calculation uses an inclusive range, but the resulting offsets * are [start, end). e_index always points to the last block in the * range. */ s_index = pcpu_off_to_block_index(bit_off); e_index = pcpu_off_to_block_index(bit_off + bits - 1); s_off = pcpu_off_to_block_off(bit_off); e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; s_block = chunk->md_blocks + s_index; e_block = chunk->md_blocks + e_index; /* * Update s_block. */ if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; /* * block->first_free must be updated if the allocation takes its place. * If the allocation breaks the contig_hint, a scan is required to * restore this hint. */ if (s_off == s_block->first_free) s_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, s_index), PCPU_BITMAP_BLOCK_BITS, s_off + bits); if (pcpu_region_overlap(s_block->scan_hint_start, s_block->scan_hint_start + s_block->scan_hint, s_off, s_off + bits)) s_block->scan_hint = 0; if (pcpu_region_overlap(s_block->contig_hint_start, s_block->contig_hint_start + s_block->contig_hint, s_off, s_off + bits)) { /* block contig hint is broken - scan to fix it */ if (!s_off) s_block->left_free = 0; pcpu_block_refresh_hint(chunk, s_index); } else { /* update left and right contig manually */ s_block->left_free = min(s_block->left_free, s_off); if (s_index == e_index) s_block->right_free = min_t(int, s_block->right_free, PCPU_BITMAP_BLOCK_BITS - e_off); else s_block->right_free = 0; } /* * Update e_block. */ if (s_index != e_index) { if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; /* * When the allocation is across blocks, the end is along * the left part of the e_block. */ e_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, e_index), PCPU_BITMAP_BLOCK_BITS, e_off); if (e_off == PCPU_BITMAP_BLOCK_BITS) { /* reset the block */ e_block++; } else { if (e_off > e_block->scan_hint_start) e_block->scan_hint = 0; e_block->left_free = 0; if (e_off > e_block->contig_hint_start) { /* contig hint is broken - scan to fix it */ pcpu_block_refresh_hint(chunk, e_index); } else { e_block->right_free = min_t(int, e_block->right_free, PCPU_BITMAP_BLOCK_BITS - e_off); } } /* update in-between md_blocks */ nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->scan_hint = 0; block->contig_hint = 0; block->left_free = 0; block->right_free = 0; } } /* * If the allocation is not atomic, some blocks may not be * populated with pages, while we account it here. The number * of pages will be added back with pcpu_chunk_populated() * when populating pages. */ if (nr_empty_pages) pcpu_update_empty_pages(chunk, -nr_empty_pages); if (pcpu_region_overlap(chunk_md->scan_hint_start, chunk_md->scan_hint_start + chunk_md->scan_hint, bit_off, bit_off + bits)) chunk_md->scan_hint = 0; /* * The only time a full chunk scan is required is if the chunk * contig hint is broken. Otherwise, it means a smaller space * was used and therefore the chunk contig hint is still correct. */ if (pcpu_region_overlap(chunk_md->contig_hint_start, chunk_md->contig_hint_start + chunk_md->contig_hint, bit_off, bit_off + bits)) pcpu_chunk_refresh_hint(chunk, false); } /** * pcpu_block_update_hint_free - updates the block hints on the free path * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request * * Updates metadata for the allocation path. This avoids a blind block * refresh by making use of the block contig hints. If this fails, it scans * forward and backward to determine the extent of the free area. This is * capped at the boundary of blocks. * * A chunk update is triggered if a page becomes free, a block becomes free, * or the free spans across blocks. This tradeoff is to minimize iterating * over the block metadata to update chunk_md->contig_hint. * chunk_md->contig_hint may be off by up to a page, but it will never be more * than the available space. If the contig hint is contained in one block, it * will be accurate. */ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, int bits) { int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ int start, end; /* start and end of the whole free area */ /* * Calculate per block offsets. * The calculation uses an inclusive range, but the resulting offsets * are [start, end). e_index always points to the last block in the * range. */ s_index = pcpu_off_to_block_index(bit_off); e_index = pcpu_off_to_block_index(bit_off + bits - 1); s_off = pcpu_off_to_block_off(bit_off); e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; s_block = chunk->md_blocks + s_index; e_block = chunk->md_blocks + e_index; /* * Check if the freed area aligns with the block->contig_hint. * If it does, then the scan to find the beginning/end of the * larger free area can be avoided. * * start and end refer to beginning and end of the free area * within each their respective blocks. This is not necessarily * the entire free area as it may span blocks past the beginning * or end of the block. */ start = s_off; if (s_off == s_block->contig_hint + s_block->contig_hint_start) { start = s_block->contig_hint_start; } else { /* * Scan backwards to find the extent of the free area. * find_last_bit returns the starting bit, so if the start bit * is returned, that means there was no last bit and the * remainder of the chunk is free. */ int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), start); start = (start == l_bit) ? 0 : l_bit + 1; } end = e_off; if (e_off == e_block->contig_hint_start) end = e_block->contig_hint_start + e_block->contig_hint; else end = find_next_bit(pcpu_index_alloc_map(chunk, e_index), PCPU_BITMAP_BLOCK_BITS, end); /* update s_block */ e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS; if (!start && e_off == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; pcpu_block_update(s_block, start, e_off); /* freeing in the same block */ if (s_index != e_index) { /* update e_block */ if (end == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; pcpu_block_update(e_block, 0, end); /* reset md_blocks in the middle */ nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->first_free = 0; block->scan_hint = 0; block->contig_hint_start = 0; block->contig_hint = PCPU_BITMAP_BLOCK_BITS; block->left_free = PCPU_BITMAP_BLOCK_BITS; block->right_free = PCPU_BITMAP_BLOCK_BITS; } } if (nr_empty_pages) pcpu_update_empty_pages(chunk, nr_empty_pages); /* * Refresh chunk metadata when the free makes a block free or spans * across blocks. The contig_hint may be off by up to a page, but if * the contig_hint is contained in a block, it will be accurate with * the else condition below. */ if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index) pcpu_chunk_refresh_hint(chunk, true); else pcpu_block_update(&chunk->chunk_md, pcpu_block_off_to_off(s_index, start), end); } /** * pcpu_is_populated - determines if the region is populated * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of area * @next_off: return value for the next offset to start searching * * For atomic allocations, check if the backing pages are populated. * * RETURNS: * Bool if the backing pages are populated. * next_index is to skip over unpopulated blocks in pcpu_find_block_fit. */ static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off) { unsigned int start, end; start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); start = find_next_zero_bit(chunk->populated, end, start); if (start >= end) return true; end = find_next_bit(chunk->populated, end, start + 1); *next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; return false; } /** * pcpu_find_block_fit - finds the block index to start searching * @chunk: chunk of interest * @alloc_bits: size of request in allocation units * @align: alignment of area (max PAGE_SIZE bytes) * @pop_only: use populated regions only * * Given a chunk and an allocation spec, find the offset to begin searching * for a free region. This iterates over the bitmap metadata blocks to * find an offset that will be guaranteed to fit the requirements. It is * not quite first fit as if the allocation does not fit in the contig hint * of a block or chunk, it is skipped. This errs on the side of caution * to prevent excess iteration. Poor alignment can cause the allocator to * skip over blocks and chunks that have valid free areas. * * RETURNS: * The offset in the bitmap to begin searching. * -1 if no offset is found. */ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, size_t align, bool pop_only) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, next_off; /* * This is an optimization to prevent scanning by assuming if the * allocation cannot fit in the global hint, there is memory pressure * and creating a new chunk would happen soon. */ if (!pcpu_check_block_hint(chunk_md, alloc_bits, align)) return -1; bit_off = pcpu_next_hint(chunk_md, alloc_bits); bits = 0; pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) { if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, &next_off)) break; bit_off = next_off; bits = 0; } if (bit_off == pcpu_chunk_map_bits(chunk)) return -1; return bit_off; } /* * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off() * @map: the address to base the search on * @size: the bitmap size in bits * @start: the bitnumber to start searching at * @nr: the number of zeroed bits we're looking for * @align_mask: alignment mask for zero area * @largest_off: offset of the largest area skipped * @largest_bits: size of the largest area skipped * * The @align_mask should be one less than a power of 2. * * This is a modified version of bitmap_find_next_zero_area_off() to remember * the largest area that was skipped. This is imperfect, but in general is * good enough. The largest remembered region is the largest failed region * seen. This does not include anything we possibly skipped due to alignment. * pcpu_block_update_scan() does scan backwards to try and recover what was * lost to alignment. While this can cause scanning to miss earlier possible * free areas, smaller allocations will eventually fill those holes. */ static unsigned long pcpu_find_zero_area(unsigned long *map, unsigned long size, unsigned long start, unsigned long nr, unsigned long align_mask, unsigned long *largest_off, unsigned long *largest_bits) { unsigned long index, end, i, area_off, area_bits; again: index = find_next_zero_bit(map, size, start); /* Align allocation */ index = __ALIGN_MASK(index, align_mask); area_off = index; end = index + nr; if (end > size) return end; i = find_next_bit(map, end, index); if (i < end) { area_bits = i - area_off; /* remember largest unused area with best alignment */ if (area_bits > *largest_bits || (area_bits == *largest_bits && *largest_off && (!area_off || __ffs(area_off) > __ffs(*largest_off)))) { *largest_off = area_off; *largest_bits = area_bits; } start = i + 1; goto again; } return index; } /** * pcpu_alloc_area - allocates an area from a pcpu_chunk * @chunk: chunk of interest * @alloc_bits: size of request in allocation units * @align: alignment of area (max PAGE_SIZE) * @start: bit_off to start searching * * This function takes in a @start offset to begin searching to fit an * allocation of @alloc_bits with alignment @align. It needs to scan * the allocation map because if it fits within the block's contig hint, * @start will be block->first_free. This is an attempt to fill the * allocation prior to breaking the contig hint. The allocation and * boundary maps are updated accordingly if it confirms a valid * free area. * * RETURNS: * Allocated addr offset in @chunk on success. * -1 if no matching area is found. */ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, size_t align, int start) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; size_t align_mask = (align) ? (align - 1) : 0; unsigned long area_off = 0, area_bits = 0; int bit_off, end, oslot; lockdep_assert_held(&pcpu_lock); oslot = pcpu_chunk_slot(chunk); /* * Search to find a fit. */ end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS, pcpu_chunk_map_bits(chunk)); bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits, align_mask, &area_off, &area_bits); if (bit_off >= end) return -1; if (area_bits) pcpu_block_update_scan(chunk, area_off, area_bits); /* update alloc map */ bitmap_set(chunk->alloc_map, bit_off, alloc_bits); /* update boundary map */ set_bit(bit_off, chunk->bound_map); bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1); set_bit(bit_off + alloc_bits, chunk->bound_map); chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; /* update first free bit */ if (bit_off == chunk_md->first_free) chunk_md->first_free = find_next_zero_bit( chunk->alloc_map, pcpu_chunk_map_bits(chunk), bit_off + alloc_bits); pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits); pcpu_chunk_relocate(chunk, oslot); return bit_off * PCPU_MIN_ALLOC_SIZE; } /** * pcpu_free_area - frees the corresponding offset * @chunk: chunk of interest * @off: addr offset into chunk * * This function determines the size of an allocation to free using * the boundary bitmap and clears the allocation map. * * RETURNS: * Number of freed bytes. */ static int pcpu_free_area(struct pcpu_chunk *chunk, int off) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, end, oslot, freed; lockdep_assert_held(&pcpu_lock); pcpu_stats_area_dealloc(chunk); oslot = pcpu_chunk_slot(chunk); bit_off = off / PCPU_MIN_ALLOC_SIZE; /* find end index */ end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), bit_off + 1); bits = end - bit_off; bitmap_clear(chunk->alloc_map, bit_off, bits); freed = bits * PCPU_MIN_ALLOC_SIZE; /* update metadata */ chunk->free_bytes += freed; /* update first free bit */ chunk_md->first_free = min(chunk_md->first_free, bit_off); pcpu_block_update_hint_free(chunk, bit_off, bits); pcpu_chunk_relocate(chunk, oslot); return freed; } static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits) { block->scan_hint = 0; block->contig_hint = nr_bits; block->left_free = nr_bits; block->right_free = nr_bits; block->first_free = 0; block->nr_bits = nr_bits; } static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) { struct pcpu_block_md *md_block; /* init the chunk's block */ pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk)); for (md_block = chunk->md_blocks; md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); md_block++) pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS); } /** * pcpu_alloc_first_chunk - creates chunks that serve the first chunk * @tmp_addr: the start of the region served * @map_size: size of the region served * * This is responsible for creating the chunks that serve the first chunk. The * base_addr is page aligned down of @tmp_addr while the region end is page * aligned up. Offsets are kept track of to determine the region served. All * this is done to appease the bitmap allocator in avoiding partial blocks. * * RETURNS: * Chunk serving the region at @tmp_addr of @map_size. */ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, int map_size) { struct pcpu_chunk *chunk; unsigned long aligned_addr; int start_offset, offset_bits, region_size, region_bits; size_t alloc_size; /* region calculations */ aligned_addr = tmp_addr & PAGE_MASK; start_offset = tmp_addr - aligned_addr; region_size = ALIGN(start_offset + map_size, PAGE_SIZE); /* allocate chunk */ alloc_size = struct_size(chunk, populated, BITS_TO_LONGS(region_size >> PAGE_SHIFT)); chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); INIT_LIST_HEAD(&chunk->list); chunk->base_addr = (void *)aligned_addr; chunk->start_offset = start_offset; chunk->end_offset = region_size - chunk->start_offset - map_size; chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]); chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk->alloc_map) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); alloc_size = BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]); chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk->bound_map) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]); chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk->md_blocks) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); #ifdef CONFIG_MEMCG_KMEM /* first chunk is free to use */ chunk->obj_cgroups = NULL; #endif pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ chunk->immutable = true; bitmap_fill(chunk->populated, chunk->nr_pages); chunk->nr_populated = chunk->nr_pages; chunk->nr_empty_pop_pages = chunk->nr_pages; chunk->free_bytes = map_size; if (chunk->start_offset) { /* hide the beginning of the bitmap */ offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE; bitmap_set(chunk->alloc_map, 0, offset_bits); set_bit(0, chunk->bound_map); set_bit(offset_bits, chunk->bound_map); chunk->chunk_md.first_free = offset_bits; pcpu_block_update_hint_alloc(chunk, 0, offset_bits); } if (chunk->end_offset) { /* hide the end of the bitmap */ offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE; bitmap_set(chunk->alloc_map, pcpu_chunk_map_bits(chunk) - offset_bits, offset_bits); set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE, chunk->bound_map); set_bit(region_bits, chunk->bound_map); pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk) - offset_bits, offset_bits); } return chunk; } static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; int region_bits; chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp); if (!chunk) return NULL; INIT_LIST_HEAD(&chunk->list); chunk->nr_pages = pcpu_unit_pages; region_bits = pcpu_chunk_map_bits(chunk); chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]), gfp); if (!chunk->alloc_map) goto alloc_map_fail; chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]), gfp); if (!chunk->bound_map) goto bound_map_fail; chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]), gfp); if (!chunk->md_blocks) goto md_blocks_fail; #ifdef CONFIG_MEMCG_KMEM if (!mem_cgroup_kmem_disabled()) { chunk->obj_cgroups = pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) * sizeof(struct obj_cgroup *), gfp); if (!chunk->obj_cgroups) goto objcg_fail; } #endif pcpu_init_md_blocks(chunk); /* init metadata */ chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; return chunk; #ifdef CONFIG_MEMCG_KMEM objcg_fail: pcpu_mem_free(chunk->md_blocks); #endif md_blocks_fail: pcpu_mem_free(chunk->bound_map); bound_map_fail: pcpu_mem_free(chunk->alloc_map); alloc_map_fail: pcpu_mem_free(chunk); return NULL; } static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; #ifdef CONFIG_MEMCG_KMEM pcpu_mem_free(chunk->obj_cgroups); #endif pcpu_mem_free(chunk->md_blocks); pcpu_mem_free(chunk->bound_map); pcpu_mem_free(chunk->alloc_map); pcpu_mem_free(chunk); } /** * pcpu_chunk_populated - post-population bookkeeping * @chunk: pcpu_chunk which got populated * @page_start: the start page * @page_end: the end page * * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each * successful population. */ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, int page_end) { int nr = page_end - page_start; lockdep_assert_held(&pcpu_lock); bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; pcpu_nr_populated += nr; pcpu_update_empty_pages(chunk, nr); } /** * pcpu_chunk_depopulated - post-depopulation bookkeeping * @chunk: pcpu_chunk which got depopulated * @page_start: the start page * @page_end: the end page * * Pages in [@page_start,@page_end) have been depopulated from @chunk. * Update the bookkeeping information accordingly. Must be called after * each successful depopulation. */ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, int page_start, int page_end) { int nr = page_end - page_start; lockdep_assert_held(&pcpu_lock); bitmap_clear(chunk->populated, page_start, nr); chunk->nr_populated -= nr; pcpu_nr_populated -= nr; pcpu_update_empty_pages(chunk, -nr); } /* * Chunk management implementation. * * To allow different implementations, chunk alloc/free and * [de]population are implemented in a separate file which is pulled * into this file and compiled together. The following functions * should be implemented. * * pcpu_populate_chunk - populate the specified range of a chunk * pcpu_depopulate_chunk - depopulate the specified range of a chunk * pcpu_post_unmap_tlb_flush - flush tlb for the specified range of a chunk * pcpu_create_chunk - create a new chunk * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop * pcpu_addr_to_page - translate address to physical address * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end, gfp_t gfp); static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end); static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end); static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); #ifdef CONFIG_NEED_PER_CPU_KM #include "percpu-km.c" #else #include "percpu-vm.c" #endif /** * pcpu_chunk_addr_search - determine chunk containing specified address * @addr: address for which the chunk needs to be determined. * * This is an internal function that handles all but static allocations. * Static percpu address values should never be passed into the allocator. * * RETURNS: * The address of the found chunk. */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { /* is it in the dynamic region (first chunk)? */ if (pcpu_addr_in_chunk(pcpu_first_chunk, addr)) return pcpu_first_chunk; /* is it in the reserved region? */ if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr)) return pcpu_reserved_chunk; /* * The address is relative to unit0 which might be unused and * thus unmapped. Offset the address to the unit space of the * current processor before looking it up in the vmalloc * space. Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug. */ addr += pcpu_unit_offsets[raw_smp_processor_id()]; return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); } #ifdef CONFIG_MEMCG_KMEM static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { struct obj_cgroup *objcg; if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT)) return true; objcg = current_obj_cgroup(); if (!objcg) return true; if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) return false; *objcgp = objcg; return true; } static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, struct pcpu_chunk *chunk, int off, size_t size) { if (!objcg) return; if (likely(chunk && chunk->obj_cgroups)) { obj_cgroup_get(objcg); chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, pcpu_obj_full_size(size)); rcu_read_unlock(); } else { obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); } } static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { struct obj_cgroup *objcg; if (unlikely(!chunk->obj_cgroups)) return; objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT]; if (!objcg) return; chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, -pcpu_obj_full_size(size)); rcu_read_unlock(); obj_cgroup_put(objcg); } #else /* CONFIG_MEMCG_KMEM */ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { return true; } static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, struct pcpu_chunk *chunk, int off, size_t size) { } static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { } #endif /* CONFIG_MEMCG_KMEM */ /** * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available * @gfp: allocation flags * * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN * then no warning will be triggered on invalid or failed allocation * requests. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, gfp_t gfp) { gfp_t pcpu_gfp; bool is_atomic; bool do_warn; struct obj_cgroup *objcg = NULL; static int warn_limit = 10; struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; unsigned long flags; void __percpu *ptr; size_t bits, bit_align; gfp = current_gfp_context(gfp); /* whitelisted flags that can be passed to the backing allocators */ pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; do_warn = !(gfp & __GFP_NOWARN); /* * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, * therefore alignment must be a minimum of that many bytes. * An allocation may have internal fragmentation from rounding up * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes. */ if (unlikely(align < PCPU_MIN_ALLOC_SIZE)) align = PCPU_MIN_ALLOC_SIZE; size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); bits = size >> PCPU_MIN_ALLOC_SHIFT; bit_align = align >> PCPU_MIN_ALLOC_SHIFT; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || !is_power_of_2(align))) { WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n", size, align); return NULL; } if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg))) return NULL; if (!is_atomic) { /* * pcpu_balance_workfn() allocates memory under this mutex, * and it may wait for memory reclaim. Allow current task * to become OOM victim, in case of memory pressure. */ if (gfp & __GFP_NOFAIL) { mutex_lock(&pcpu_alloc_mutex); } else if (mutex_lock_killable(&pcpu_alloc_mutex)) { pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); return NULL; } } spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); if (off < 0) { err = "alloc from reserved chunk failed"; goto fail_unlock; } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; err = "alloc from reserved chunk failed"; goto fail_unlock; } restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) { list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot], list) { off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); if (off < 0) { if (slot < PCPU_SLOT_FAIL_THRESHOLD) pcpu_chunk_move(chunk, 0); continue; } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) { pcpu_reintegrate_chunk(chunk); goto area_found; } } } spin_unlock_irqrestore(&pcpu_lock, flags); if (is_atomic) { err = "atomic alloc failed, no space left"; goto fail; } /* No space left. Create a new chunk. */ if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) { chunk = pcpu_create_chunk(pcpu_gfp); if (!chunk) { err = "failed to allocate new chunk"; goto fail; } spin_lock_irqsave(&pcpu_lock, flags); pcpu_chunk_relocate(chunk, -1); } else { spin_lock_irqsave(&pcpu_lock, flags); } goto restart; area_found: pcpu_stats_area_alloc(chunk, size); spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ if (!is_atomic) { unsigned int page_end, rs, re; rs = PFN_DOWN(off); page_end = PFN_UP(off + size); for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } pcpu_chunk_populated(chunk, rs, re); spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); } if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_