| 6 6 5 5 1 5 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ufs/util.c * * Copyright (C) 1998 * Daniel Pirkl <daniel.pirkl@email.cz> * Charles University, Faculty of Mathematics and Physics */ #include <linux/string.h> #include <linux/slab.h> #include <linux/buffer_head.h> #include "ufs_fs.h" #include "ufs.h" #include "swab.h" #include "util.h" struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi, struct super_block *sb, u64 fragment, u64 size) { struct ufs_buffer_head * ubh; unsigned i, j ; u64 count = 0; if (size & ~uspi->s_fmask) return NULL; count = size >> uspi->s_fshift; if (count > UFS_MAXFRAG) return NULL; ubh = kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS); if (!ubh) return NULL; ubh->fragment = fragment; ubh->count = count; for (i = 0; i < count; i++) if (!(ubh->bh[i] = sb_bread(sb, fragment + i))) goto failed; for (; i < UFS_MAXFRAG; i++) ubh->bh[i] = NULL; return ubh; failed: for (j = 0; j < i; j++) brelse (ubh->bh[j]); kfree(ubh); return NULL; } struct 
ufs_buffer_head * ubh_bread_uspi (struct ufs_sb_private_info * uspi, struct super_block *sb, u64 fragment, u64 size) { unsigned i, j; u64 count = 0; if (size & ~uspi->s_fmask) return NULL; count = size >> uspi->s_fshift; if (count <= 0 || count > UFS_MAXFRAG) return NULL; USPI_UBH(uspi)->fragment = fragment; USPI_UBH(uspi)->count = count; for (i = 0; i < count; i++) if (!(USPI_UBH(uspi)->bh[i] = sb_bread(sb, fragment + i))) goto failed; for (; i < UFS_MAXFRAG; i++) USPI_UBH(uspi)->bh[i] = NULL; return USPI_UBH(uspi); failed: for (j = 0; j < i; j++) brelse (USPI_UBH(uspi)->bh[j]); return NULL; } void ubh_brelse (struct ufs_buffer_head * ubh) { unsigned i; if (!ubh) return; for (i = 0; i < ubh->count; i++) brelse (ubh->bh[i]); kfree (ubh); } void ubh_brelse_uspi (struct ufs_sb_private_info * uspi) { unsigned i; if (!USPI_UBH(uspi)) return; for ( i = 0; i < USPI_UBH(uspi)->count; i++ ) { brelse (USPI_UBH(uspi)->bh[i]); USPI_UBH(uspi)->bh[i] = NULL; } } void ubh_mark_buffer_dirty (struct ufs_buffer_head * ubh) { unsigned i; if (!ubh) return; for ( i = 0; i < ubh->count; i++ ) mark_buffer_dirty (ubh->bh[i]); } void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag) { unsigned i; if (!ubh) return; if (flag) { for ( i = 0; i < ubh->count; i++ ) set_buffer_uptodate (ubh->bh[i]); } else { for ( i = 0; i < ubh->count; i++ ) clear_buffer_uptodate (ubh->bh[i]); } } void ubh_sync_block(struct ufs_buffer_head *ubh) { if (ubh) { unsigned i; for (i = 0; i < ubh->count; i++) write_dirty_buffer(ubh->bh[i], 0); for (i = 0; i < ubh->count; i++) wait_on_buffer(ubh->bh[i]); } } void ubh_bforget (struct ufs_buffer_head * ubh) { unsigned i; if (!ubh) return; for ( i = 0; i < ubh->count; i++ ) if ( ubh->bh[i] ) bforget (ubh->bh[i]); } int ubh_buffer_dirty (struct ufs_buffer_head * ubh) { unsigned i; unsigned result = 0; if (!ubh) return 0; for ( i = 0; i < ubh->count; i++ ) result |= buffer_dirty(ubh->bh[i]); return result; } void _ubh_ubhcpymem_(struct ufs_sb_private_info * 
uspi, unsigned char * mem, struct ufs_buffer_head * ubh, unsigned size) { unsigned len, bhno; if (size > (ubh->count << uspi->s_fshift)) size = ubh->count << uspi->s_fshift; bhno = 0; while (size) { len = min_t(unsigned int, size, uspi->s_fsize); memcpy (mem, ubh->bh[bhno]->b_data, len); mem += uspi->s_fsize; size -= len; bhno++; } } void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, unsigned char * mem, unsigned size) { unsigned len, bhno; if (size > (ubh->count << uspi->s_fshift)) size = ubh->count << uspi->s_fshift; bhno = 0; while (size) { len = min_t(unsigned int, size, uspi->s_fsize); memcpy (ubh->bh[bhno]->b_data, mem, len); mem += uspi->s_fsize; size -= len; bhno++; } } dev_t ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi) { __u32 fs32; dev_t dev; if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[1]); else fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[0]); switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUNx86: case UFS_ST_SUN: if ((fs32 & 0xffff0000) == 0 || (fs32 & 0xffff0000) == 0xffff0000) dev = old_decode_dev(fs32 & 0x7fff); else dev = MKDEV(sysv_major(fs32), sysv_minor(fs32)); break; default: dev = old_decode_dev(fs32); break; } return dev; } void ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev) { __u32 fs32; switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) { case UFS_ST_SUNx86: case UFS_ST_SUN: fs32 = sysv_encode_dev(dev); if ((fs32 & 0xffff8000) == 0) { fs32 = old_encode_dev(dev); } break; default: fs32 = old_encode_dev(dev); break; } if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86) ufsi->i_u1.i_data[1] = cpu_to_fs32(sb, fs32); else ufsi->i_u1.i_data[0] = cpu_to_fs32(sb, fs32); } /** * ufs_get_locked_folio() - locate, pin and lock a pagecache folio, if not exist * read it from disk. 
* @mapping: the address_space to search * @index: the page index * * Locates the desired pagecache folio, if not exist we'll read it, * locks it, increments its reference * count and returns its address. * */ struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index) { struct inode *inode = mapping->host; struct folio *folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { folio = read_mapping_folio(mapping, index, NULL); if (IS_ERR(folio)) { printk(KERN_ERR "ufs_change_blocknr: read_mapping_folio error: ino %lu, index: %lu\n", mapping->host->i_ino, index); return folio; } folio_lock(folio); if (unlikely(folio->mapping == NULL)) { /* Truncate got there first */ folio_unlock(folio); folio_put(folio); return NULL; } } if (!folio_buffers(folio)) create_empty_buffers(folio, 1 << inode->i_blkbits, 0); return folio; } |
| 11589 11600 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H #include <linux/uaccess.h> #include <linux/ptrace.h> #include <asm/cpu_entry_area.h> #include <asm/switch_to.h> enum stack_type { STACK_TYPE_UNKNOWN, STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; struct stack_info { enum stack_type type; unsigned long *begin, *end, *next_sp; }; bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info); static __always_inline bool get_stack_guard_info(unsigned long *stack, struct stack_info *info) { /* make sure it's not in the stack proper */ if (get_stack_info_noinstr(stack, current, info)) return false; /* but if it is in the page below it, we hit a guard */ return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info); } const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) { void *begin = info->begin; void *end = info->end; return (info->type != STACK_TYPE_UNKNOWN && addr >= begin && addr < end && addr + len > begin && addr + len <= end); } #ifdef 
CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else #define STACKSLOTS_PER_LINE 4 #endif #ifdef CONFIG_FRAME_POINTER static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->bp; if (task == current) return __builtin_frame_address(0); return &((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { return NULL; } #endif /* CONFIG_FRAME_POINTER */ static inline unsigned long * get_stack_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->sp; if (task == current) return __builtin_frame_address(0); return (unsigned long *)task->thread.sp; } /* The form of the top of the frame on the stack */ struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; }; struct stack_frame_ia32 { u32 next_frame; u32 return_address; }; void show_opcodes(struct pt_regs *regs, const char *loglvl); void show_ip(struct pt_regs *regs, const char *loglvl); #endif /* _ASM_X86_STACKTRACE_H */ |
| 3 1 3 3 3 3 3 1 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 | // SPDX-License-Identifier: GPL-2.0 #include <linux/in.h> #include <linux/inet.h> #include <linux/list.h> #include <linux/module.h> #include <linux/net.h> #include <linux/proc_fs.h> #include <linux/rculist.h> #include <linux/seq_file.h> #include <linux/socket.h> #include <net/inet_sock.h> #include <net/kcm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/tcp.h> #ifdef CONFIG_PROC_FS static struct kcm_mux *kcm_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct kcm_net 
*knet = net_generic(net, kcm_net_id); return list_first_or_null_rcu(&knet->mux_list, struct kcm_mux, kcm_mux_list); } static struct kcm_mux *kcm_get_next(struct kcm_mux *mux) { struct kcm_net *knet = mux->knet; return list_next_or_null_rcu(&knet->mux_list, &mux->kcm_mux_list, struct kcm_mux, kcm_mux_list); } static struct kcm_mux *kcm_get_idx(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); struct kcm_net *knet = net_generic(net, kcm_net_id); struct kcm_mux *m; list_for_each_entry_rcu(m, &knet->mux_list, kcm_mux_list) { if (!pos) return m; --pos; } return NULL; } static void *kcm_seq_next(struct seq_file *seq, void *v, loff_t *pos) { void *p; if (v == SEQ_START_TOKEN) p = kcm_get_first(seq); else p = kcm_get_next(v); ++*pos; return p; } static void *kcm_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); if (!*pos) return SEQ_START_TOKEN; else return kcm_get_idx(seq, *pos - 1); } static void kcm_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { rcu_read_unlock(); } struct kcm_proc_mux_state { struct seq_net_private p; int idx; }; static void kcm_format_mux_header(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct kcm_net *knet = net_generic(net, kcm_net_id); seq_printf(seq, "*** KCM statistics (%d MUX) ****\n", knet->count); seq_printf(seq, "%-14s %-10s %-16s %-10s %-16s %-8s %-8s %-8s %-8s %s", "Object", "RX-Msgs", "RX-Bytes", "TX-Msgs", "TX-Bytes", "Recv-Q", "Rmem", "Send-Q", "Smem", "Status"); /* XXX: pdsts header stuff here */ seq_puts(seq, "\n"); } static void kcm_format_sock(struct kcm_sock *kcm, struct seq_file *seq, int i, int *len) { seq_printf(seq, " kcm-%-7u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8s ", kcm->index, kcm->stats.rx_msgs, kcm->stats.rx_bytes, kcm->stats.tx_msgs, kcm->stats.tx_bytes, kcm->sk.sk_receive_queue.qlen, sk_rmem_alloc_get(&kcm->sk), kcm->sk.sk_write_queue.qlen, "-"); if (kcm->tx_psock) seq_printf(seq, "Psck-%u ", kcm->tx_psock->index); if 
(kcm->tx_wait) seq_puts(seq, "TxWait "); if (kcm->tx_wait_more) seq_puts(seq, "WMore "); if (kcm->rx_wait) seq_puts(seq, "RxWait "); seq_puts(seq, "\n"); } static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, int i, int *len) { seq_printf(seq, " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ", psock->index, psock->strp.stats.msgs, psock->strp.stats.bytes, psock->stats.tx_msgs, psock->stats.tx_bytes, psock->sk->sk_receive_queue.qlen, atomic_read(&psock->sk->sk_rmem_alloc), psock->sk->sk_write_queue.qlen, refcount_read(&psock->sk->sk_wmem_alloc)); if (psock->done) seq_puts(seq, "Done "); if (psock->tx_stopped) seq_puts(seq, "TxStop "); if (psock->strp.stopped) seq_puts(seq, "RxStop "); if (psock->tx_kcm) seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index); if (!psock->strp.paused && !psock->ready_rx_msg) { if (psock->sk->sk_receive_queue.qlen) { if (psock->strp.need_bytes) seq_printf(seq, "RxWait=%u ", psock->strp.need_bytes); else seq_printf(seq, "RxWait "); } } else { if (psock->strp.paused) seq_puts(seq, "RxPause "); if (psock->ready_rx_msg) seq_puts(seq, "RdyRx "); } seq_puts(seq, "\n"); } static void kcm_format_mux(struct kcm_mux *mux, loff_t idx, struct seq_file *seq) { int i, len; struct kcm_sock *kcm; struct kcm_psock *psock; /* mux information */ seq_printf(seq, "%-6s%-8s %-10llu %-16llu %-10llu %-16llu %-8s %-8s %-8s %-8s ", "mux", "", mux->stats.rx_msgs, mux->stats.rx_bytes, mux->stats.tx_msgs, mux->stats.tx_bytes, "-", "-", "-", "-"); seq_printf(seq, "KCMs: %d, Psocks %d\n", mux->kcm_socks_cnt, mux->psocks_cnt); /* kcm sock information */ i = 0; spin_lock_bh(&mux->lock); list_for_each_entry(kcm, &mux->kcm_socks, kcm_sock_list) { kcm_format_sock(kcm, seq, i, &len); i++; } i = 0; list_for_each_entry(psock, &mux->psocks, psock_list) { kcm_format_psock(psock, seq, i, &len); i++; } spin_unlock_bh(&mux->lock); } static int kcm_seq_show(struct seq_file *seq, void *v) { struct kcm_proc_mux_state *mux_state; mux_state = 
seq->private; if (v == SEQ_START_TOKEN) { mux_state->idx = 0; kcm_format_mux_header(seq); } else { kcm_format_mux(v, mux_state->idx, seq); mux_state->idx++; } return 0; } static const struct seq_operations kcm_seq_ops = { .show = kcm_seq_show, .start = kcm_seq_start, .next = kcm_seq_next, .stop = kcm_seq_stop, }; static int kcm_stats_seq_show(struct seq_file *seq, void *v) { struct kcm_psock_stats psock_stats; struct kcm_mux_stats mux_stats; struct strp_aggr_stats strp_stats; struct kcm_mux *mux; struct kcm_psock *psock; struct net *net = seq->private; struct kcm_net *knet = net_generic(net, kcm_net_id); memset(&mux_stats, 0, sizeof(mux_stats)); memset(&psock_stats, 0, sizeof(psock_stats)); memset(&strp_stats, 0, sizeof(strp_stats)); mutex_lock(&knet->mutex); aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats); aggregate_psock_stats(&knet->aggregate_psock_stats, &psock_stats); aggregate_strp_stats(&knet->aggregate_strp_stats, &strp_stats); list_for_each_entry(mux, &knet->mux_list, kcm_mux_list) { spin_lock_bh(&mux->lock); aggregate_mux_stats(&mux->stats, &mux_stats); aggregate_psock_stats(&mux->aggregate_psock_stats, &psock_stats); aggregate_strp_stats(&mux->aggregate_strp_stats, &strp_stats); list_for_each_entry(psock, &mux->psocks, psock_list) { aggregate_psock_stats(&psock->stats, &psock_stats); save_strp_stats(&psock->strp, &strp_stats); } spin_unlock_bh(&mux->lock); } mutex_unlock(&knet->mutex); seq_printf(seq, "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s\n", "MUX", "RX-Msgs", "RX-Bytes", "TX-Msgs", "TX-Bytes", "TX-Retries", "Attach", "Unattach", "UnattchRsvd", "RX-RdyDrops"); seq_printf(seq, "%-8s %-10llu %-16llu %-10llu %-16llu %-10u %-10u %-10u %-10u %-10u\n", "", mux_stats.rx_msgs, mux_stats.rx_bytes, mux_stats.tx_msgs, mux_stats.tx_bytes, mux_stats.tx_retries, mux_stats.psock_attach, mux_stats.psock_unattach_rsvd, mux_stats.psock_unattach, mux_stats.rx_ready_drops); seq_printf(seq, "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s 
%-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n", "Psock", "RX-Msgs", "RX-Bytes", "TX-Msgs", "TX-Bytes", "Reserved", "Unreserved", "RX-Aborts", "RX-Intr", "RX-Unrecov", "RX-MemFail", "RX-NeedMor", "RX-BadLen", "RX-TooBig", "RX-Timeout", "TX-Aborts"); seq_printf(seq, "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", "", strp_stats.msgs, strp_stats.bytes, psock_stats.tx_msgs, psock_stats.tx_bytes, psock_stats.reserved, psock_stats.unreserved, strp_stats.aborts, strp_stats.interrupted, strp_stats.unrecov_intr, strp_stats.mem_fail, strp_stats.need_more_hdr, strp_stats.bad_hdr_len, strp_stats.msg_too_big, strp_stats.msg_timeouts, psock_stats.tx_aborts); return 0; } static int kcm_proc_init_net(struct net *net) { if (!proc_create_net_single("kcm_stats", 0444, net->proc_net, kcm_stats_seq_show, NULL)) goto out_kcm_stats; if (!proc_create_net("kcm", 0444, net->proc_net, &kcm_seq_ops, sizeof(struct kcm_proc_mux_state))) goto out_kcm; return 0; out_kcm: remove_proc_entry("kcm_stats", net->proc_net); out_kcm_stats: return -ENOMEM; } static void kcm_proc_exit_net(struct net *net) { remove_proc_entry("kcm", net->proc_net); remove_proc_entry("kcm_stats", net->proc_net); } static struct pernet_operations kcm_net_ops = { .init = kcm_proc_init_net, .exit = kcm_proc_exit_net, }; int __init kcm_proc_init(void) { return register_pernet_subsys(&kcm_net_ops); } void __exit kcm_proc_exit(void) { unregister_pernet_subsys(&kcm_net_ops); } #endif /* CONFIG_PROC_FS */ |
| 21 19 26 26 21 19 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM vmscan #if 
!defined(_TRACE_VMSCAN_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_VMSCAN_H #include <linux/types.h> #include <linux/tracepoint.h> #include <linux/mm.h> #include <linux/memcontrol.h> #include <trace/events/mmflags.h> #define RECLAIM_WB_ANON 0x0001u #define RECLAIM_WB_FILE 0x0002u #define RECLAIM_WB_MIXED 0x0010u #define RECLAIM_WB_SYNC 0x0004u /* Unused, all reclaim async */ #define RECLAIM_WB_ASYNC 0x0008u #define RECLAIM_WB_LRU (RECLAIM_WB_ANON|RECLAIM_WB_FILE) #define show_reclaim_flags(flags) \ (flags) ? __print_flags(flags, "|", \ {RECLAIM_WB_ANON, "RECLAIM_WB_ANON"}, \ {RECLAIM_WB_FILE, "RECLAIM_WB_FILE"}, \ {RECLAIM_WB_MIXED, "RECLAIM_WB_MIXED"}, \ {RECLAIM_WB_SYNC, "RECLAIM_WB_SYNC"}, \ {RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \ ) : "RECLAIM_WB_NONE" #define _VMSCAN_THROTTLE_WRITEBACK (1 << VMSCAN_THROTTLE_WRITEBACK) #define _VMSCAN_THROTTLE_ISOLATED (1 << VMSCAN_THROTTLE_ISOLATED) #define _VMSCAN_THROTTLE_NOPROGRESS (1 << VMSCAN_THROTTLE_NOPROGRESS) #define _VMSCAN_THROTTLE_CONGESTED (1 << VMSCAN_THROTTLE_CONGESTED) #define show_throttle_flags(flags) \ (flags) ? __print_flags(flags, "|", \ {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"}, \ {_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"}, \ {_VMSCAN_THROTTLE_NOPROGRESS, "VMSCAN_THROTTLE_NOPROGRESS"}, \ {_VMSCAN_THROTTLE_CONGESTED, "VMSCAN_THROTTLE_CONGESTED"} \ ) : "VMSCAN_THROTTLE_NONE" #define trace_reclaim_flags(file) ( \ (file ? 
RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ (RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, TP_PROTO(int nid), TP_ARGS(nid), TP_STRUCT__entry( __field( int, nid ) ), TP_fast_assign( __entry->nid = nid; ), TP_printk("nid=%d", __entry->nid) ); TRACE_EVENT(mm_vmscan_kswapd_wake, TP_PROTO(int nid, int zid, int order), TP_ARGS(nid, zid, order), TP_STRUCT__entry( __field( int, nid ) __field( int, zid ) __field( int, order ) ), TP_fast_assign( __entry->nid = nid; __entry->zid = zid; __entry->order = order; ), TP_printk("nid=%d order=%d", __entry->nid, __entry->order) ); TRACE_EVENT(mm_vmscan_wakeup_kswapd, TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags), TP_ARGS(nid, zid, order, gfp_flags), TP_STRUCT__entry( __field( int, nid ) __field( int, zid ) __field( int, order ) __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->nid = nid; __entry->zid = zid; __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("nid=%d order=%d gfp_flags=%s", __entry->nid, __entry->order, show_gfp_flags(__entry->gfp_flags)) ); DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, TP_PROTO(int order, gfp_t gfp_flags), TP_ARGS(order, gfp_flags), TP_STRUCT__entry( __field( int, order ) __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->order = order; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("order=%d gfp_flags=%s", __entry->order, show_gfp_flags(__entry->gfp_flags)) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, TP_PROTO(int order, gfp_t gfp_flags), TP_ARGS(order, gfp_flags) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, TP_PROTO(int order, gfp_t gfp_flags), TP_ARGS(order, gfp_flags) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, TP_PROTO(int order, gfp_t gfp_flags), TP_ARGS(order, gfp_flags) ); #endif /* CONFIG_MEMCG */ 
DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed), TP_STRUCT__entry( __field( unsigned long, nr_reclaimed ) ), TP_fast_assign( __entry->nr_reclaimed = nr_reclaimed; ), TP_printk("nr_reclaimed=%lu", __entry->nr_reclaimed) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_direct_reclaim_end, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_reclaim_end, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_reclaim_end, TP_PROTO(unsigned long nr_reclaimed), TP_ARGS(nr_reclaimed) ); #endif /* CONFIG_MEMCG */ TRACE_EVENT(mm_shrink_slab_start, TP_PROTO(struct shrinker *shr, struct shrink_control *sc, long nr_objects_to_shrink, unsigned long cache_items, unsigned long long delta, unsigned long total_scan, int priority), TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan, priority), TP_STRUCT__entry( __field(struct shrinker *, shr) __field(void *, shrink) __field(int, nid) __field(long, nr_objects_to_shrink) __field(unsigned long, gfp_flags) __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) __field(int, priority) ), TP_fast_assign( __entry->shr = shr; __entry->shrink = shr->scan_objects; __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = (__force unsigned long)sc->gfp_mask; __entry->cache_items = cache_items; __entry->delta = delta; __entry->total_scan = total_scan; __entry->priority = priority; ), TP_printk("%pS %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", __entry->shrink, __entry->shr, __entry->nid, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), __entry->cache_items, __entry->delta, __entry->total_scan, 
__entry->priority) ); TRACE_EVENT(mm_shrink_slab_end, TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval, long unused_scan_cnt, long new_scan_cnt, long total_scan), TP_ARGS(shr, nid, shrinker_retval, unused_scan_cnt, new_scan_cnt, total_scan), TP_STRUCT__entry( __field(struct shrinker *, shr) __field(int, nid) __field(void *, shrink) __field(long, unused_scan) __field(long, new_scan) __field(int, retval) __field(long, total_scan) ), TP_fast_assign( __entry->shr = shr; __entry->nid = nid; __entry->shrink = shr->scan_objects; __entry->unused_scan = unused_scan_cnt; __entry->new_scan = new_scan_cnt; __entry->retval = shrinker_retval; __entry->total_scan = total_scan; ), TP_printk("%pS %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", __entry->shrink, __entry->shr, __entry->nid, __entry->unused_scan, __entry->new_scan, __entry->total_scan, __entry->retval) ); TRACE_EVENT(mm_vmscan_lru_isolate, TP_PROTO(int highest_zoneidx, int order, unsigned long nr_requested, unsigned long nr_scanned, unsigned long nr_skipped, unsigned long nr_taken, int lru), TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, lru), TP_STRUCT__entry( __field(int, highest_zoneidx) __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) __field(unsigned long, nr_skipped) __field(unsigned long, nr_taken) __field(int, lru) ), TP_fast_assign( __entry->highest_zoneidx = highest_zoneidx; __entry->order = order; __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; __entry->nr_skipped = nr_skipped; __entry->nr_taken = nr_taken; __entry->lru = lru; ), /* * classzone is previous name of the highest_zoneidx. * Reason not to change it is the ABI requirement of the tracepoint. 
*/

	TP_printk("classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s",
		__entry->highest_zoneidx,
		__entry->order,
		__entry->nr_requested,
		__entry->nr_scanned,
		__entry->nr_skipped,
		__entry->nr_taken,
		__print_symbolic(__entry->lru, LRU_NAMES))
);

/*
 * mm_vmscan_write_folio: records the folio's pfn and the reclaim flags
 * computed by trace_reclaim_flags() from folio_is_file_lru().  The struct
 * page pointer that is printed is re-derived from the stored pfn.
 */
TRACE_EVENT(mm_vmscan_write_folio,

	TP_PROTO(struct folio *folio),

	TP_ARGS(folio),

	TP_STRUCT__entry(
		__field(unsigned long, pfn)
		__field(int, reclaim_flags)
	),

	TP_fast_assign(
		__entry->pfn = folio_pfn(folio);
		__entry->reclaim_flags = trace_reclaim_flags(
						folio_is_file_lru(folio));
	),

	TP_printk("page=%p pfn=0x%lx flags=%s",
		pfn_to_page(__entry->pfn),
		__entry->pfn,
		show_reclaim_flags(__entry->reclaim_flags))
);

/*
 * mm_vmscan_lru_shrink_inactive: records the node id, the scanned and
 * reclaimed counts, a copy of the counters taken from @stat, the priority
 * and the reclaim flags derived from @file.  nr_activate[0] and
 * nr_activate[1] are printed as nr_activate_anon and nr_activate_file.
 *
 * NOTE(review): the format string below is split across two physical lines
 * inside the literal; this looks like an artifact of how this text was
 * extracted and should be confirmed against the real header.
 */
TRACE_EVENT(mm_vmscan_lru_shrink_inactive,

	TP_PROTO(int nid,
		unsigned long nr_scanned, unsigned long nr_reclaimed,
		struct reclaim_stat *stat, int priority, int file),

	TP_ARGS(nid, nr_scanned, nr_reclaimed, stat, priority, file),

	TP_STRUCT__entry(
		__field(int, nid)
		__field(unsigned long, nr_scanned)
		__field(unsigned long, nr_reclaimed)
		__field(unsigned long, nr_dirty)
		__field(unsigned long, nr_writeback)
		__field(unsigned long, nr_congested)
		__field(unsigned long, nr_immediate)
		__field(unsigned int, nr_activate0)
		__field(unsigned int, nr_activate1)
		__field(unsigned long, nr_ref_keep)
		__field(unsigned long, nr_unmap_fail)
		__field(int, priority)
		__field(int, reclaim_flags)
	),

	TP_fast_assign(
		__entry->nid = nid;
		__entry->nr_scanned = nr_scanned;
		__entry->nr_reclaimed = nr_reclaimed;
		__entry->nr_dirty = stat->nr_dirty;
		__entry->nr_writeback = stat->nr_writeback;
		__entry->nr_congested = stat->nr_congested;
		__entry->nr_immediate = stat->nr_immediate;
		__entry->nr_activate0 = stat->nr_activate[0];
		__entry->nr_activate1 = stat->nr_activate[1];
		__entry->nr_ref_keep = stat->nr_ref_keep;
		__entry->nr_unmap_fail = stat->nr_unmap_fail;
		__entry->priority = priority;
		__entry->reclaim_flags = trace_reclaim_flags(file);
	),

	TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld 
nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s",
		__entry->nid,
		__entry->nr_scanned, __entry->nr_reclaimed,
		__entry->nr_dirty, __entry->nr_writeback,
		__entry->nr_congested, __entry->nr_immediate,
		__entry->nr_activate0, __entry->nr_activate1,
		__entry->nr_ref_keep, __entry->nr_unmap_fail,
		__entry->priority,
		show_reclaim_flags(__entry->reclaim_flags))
);

/*
 * mm_vmscan_lru_shrink_active: records the node id, the taken, active,
 * deactivated and referenced counts passed in, the priority and the
 * reclaim flags derived from @file.
 */
TRACE_EVENT(mm_vmscan_lru_shrink_active,

	TP_PROTO(int nid, unsigned long nr_taken,
		unsigned long nr_active, unsigned long nr_deactivated,
		unsigned long nr_referenced, int priority, int file),

	TP_ARGS(nid, nr_taken, nr_active, nr_deactivated, nr_referenced, priority, file),

	TP_STRUCT__entry(
		__field(int, nid)
		__field(unsigned long, nr_taken)
		__field(unsigned long, nr_active)
		__field(unsigned long, nr_deactivated)
		__field(unsigned long, nr_referenced)
		__field(int, priority)
		__field(int, reclaim_flags)
	),

	TP_fast_assign(
		__entry->nid = nid;
		__entry->nr_taken = nr_taken;
		__entry->nr_active = nr_active;
		__entry->nr_deactivated = nr_deactivated;
		__entry->nr_referenced = nr_referenced;
		__entry->priority = priority;
		__entry->reclaim_flags = trace_reclaim_flags(file);
	),

	TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s",
		__entry->nid,
		__entry->nr_taken,
		__entry->nr_active, __entry->nr_deactivated, __entry->nr_referenced,
		__entry->priority,
		show_reclaim_flags(__entry->reclaim_flags))
);

/*
 * mm_vmscan_node_reclaim_begin: records the node id, the order and the gfp
 * flags; the flags are stored as an unsigned long.
 */
TRACE_EVENT(mm_vmscan_node_reclaim_begin,

	TP_PROTO(int nid, int order, gfp_t gfp_flags),

	TP_ARGS(nid, order, gfp_flags),

	TP_STRUCT__entry(
		__field(int, nid)
		__field(int, order)
		__field(unsigned long, gfp_flags)
	),

	TP_fast_assign(
		__entry->nid = nid;
		__entry->order = order;
		__entry->gfp_flags = (__force unsigned long)gfp_flags;
	),

	TP_printk("nid=%d order=%d gfp_flags=%s",
		__entry->nid,
		__entry->order,
		show_gfp_flags(__entry->gfp_flags))
);

/*
 * mm_vmscan_node_reclaim_end: another event instantiated from the
 * mm_vmscan_direct_reclaim_end_template class; takes only nr_reclaimed.
 */
DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template,
	mm_vmscan_node_reclaim_end,

	TP_PROTO(unsigned long nr_reclaimed),

	TP_ARGS(nr_reclaimed)
);

/*
 * mm_vmscan_throttled: records the node id, the timeout and the delay, and
 * stores the reason as a single bit (1U << reason) so it can be printed
 * with show_throttle_flags().
 *
 * NOTE(review): "usect_delayed" in the format string looks like a typo for
 * "usec_delayed"; it is left untouched because it is runtime output.
 */
TRACE_EVENT(mm_vmscan_throttled,

	TP_PROTO(int nid, int usec_timeout, int usec_delayed, int reason),

	TP_ARGS(nid, usec_timeout, usec_delayed, reason),

	TP_STRUCT__entry(
		__field(int, nid)
		__field(int, usec_timeout)
		__field(int, usec_delayed)
		__field(int, reason)
	),

	TP_fast_assign(
		__entry->nid = nid;
		__entry->usec_timeout = usec_timeout;
		__entry->usec_delayed = usec_delayed;
		__entry->reason = 1U << reason;
	),

	TP_printk("nid=%d usec_timeout=%d usect_delayed=%d reason=%s",
		__entry->nid,
		__entry->usec_timeout,
		__entry->usec_delayed,
		show_throttle_flags(__entry->reason))
);
#endif /* _TRACE_VMSCAN_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
|
| 880 20 882 2807 2807 1703 1074 1623 246 245 387 389 2511 2506 3095 3094 2851 4 3 2843 40 3 38 7 7 311 312 312 275 38 2497 2357 2489 1 1901 3 1899 4406 4418 1 1 1 403 403 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 
484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 
984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/compiler.h> #include <linux/export.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mman.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> #include <linux/userfaultfd_k.h> #include <linux/elf.h> #include <linux/elf-randomize.h> #include <linux/personality.h> #include <linux/random.h> #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <kunit/visibility.h> #include "internal.h" #include "swap.h" /** * kfree_const - conditionally free memory * @x: pointer to the memory * * Function calls kfree only if @x is not in .rodata section. 
*/ void kfree_const(const void *x) { if (!is_kernel_rodata((unsigned long)x)) kfree(x); } EXPORT_SYMBOL(kfree_const); /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s or %NULL in case of error */ noinline char *kstrdup(const char *s, gfp_t gfp) { size_t len; char *buf; if (!s) return NULL; len = strlen(s) + 1; buf = kmalloc_track_caller(len, gfp); if (buf) memcpy(buf, s, len); return buf; } EXPORT_SYMBOL(kstrdup); /** * kstrdup_const - conditionally duplicate an existing const string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Strings allocated by kstrdup_const should be freed by kfree_const and * must not be passed to krealloc(). * * Return: source string if it is in .rodata section otherwise * fallback to kstrdup. */ const char *kstrdup_const(const char *s, gfp_t gfp) { if (is_kernel_rodata((unsigned long)s)) return s; return kstrdup(s, gfp); } EXPORT_SYMBOL(kstrdup_const); /** * kstrndup - allocate space for and copy an existing string * @s: the string to duplicate * @max: read at most @max chars from @s * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Use kmemdup_nul() instead if the size is known exactly. * * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { size_t len; char *buf; if (!s) return NULL; len = strnlen(s, max); buf = kmalloc_track_caller(len+1, gfp); if (buf) { memcpy(buf, s, len); buf[len] = '\0'; } return buf; } EXPORT_SYMBOL(kstrndup); /** * kmemdup - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use * * Return: newly allocated copy of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. 
*/ void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) { void *p; p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL(kmemdup_noprof); /** * kmemdup_array - duplicate a given array. * * @src: array to duplicate. * @count: number of elements to duplicate from array. * @element_size: size of each element of array. * @gfp: GFP mask to use. * * Return: duplicated array of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) { return kmemdup(src, size_mul(element_size, count), gfp); } EXPORT_SYMBOL(kmemdup_array); /** * kvmemdup - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use * * Return: newly allocated copy of @src or %NULL in case of error, * result may be not physically contiguous. Use kvfree() to free. */ void *kvmemdup(const void *src, size_t len, gfp_t gfp) { void *p; p = kvmalloc(len, gfp); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL(kvmemdup); /** * kmemdup_nul - Create a NUL-terminated string from unterminated data * @s: The data to stringify * @len: The size of the data * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s with NUL-termination or %NULL in * case of error */ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) { char *buf; if (!s) return NULL; buf = kmalloc_track_caller(len + 1, gfp); if (buf) { memcpy(buf, s, len); buf[len] = '\0'; } return buf; } EXPORT_SYMBOL(kmemdup_nul); static kmem_buckets *user_buckets __ro_after_init; static int __init init_user_buckets(void) { user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL); return 0; } subsys_initcall(init_user_buckets); /** * memdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes 
to copy * * Return: an ERR_PTR() on failure. Result is physically * contiguous, to be freed by kfree(). */ void *memdup_user(const void __user *src, size_t len) { void *p; p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(memdup_user); /** * vmemdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. Result may be not * physically contiguous. Use kvfree() to free. */ void *vmemdup_user(const void __user *src, size_t len) { void *p; p = kmem_buckets_valloc(user_buckets, len, GFP_USER); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kvfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(vmemdup_user); /** * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. * * Return: newly allocated copy of @s or an ERR_PTR() in case of error */ char *strndup_user(const char __user *s, long n) { char *p; long length; length = strnlen_user(s, n); if (!length) return ERR_PTR(-EFAULT); if (length > n) return ERR_PTR(-EINVAL); p = memdup_user(s, length); if (IS_ERR(p)) return p; p[length - 1] = '\0'; return p; } EXPORT_SYMBOL(strndup_user); /** * memdup_user_nul - duplicate memory region from user space and NUL-terminate * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. */ void *memdup_user_nul(const void __user *src, size_t len) { char *p; /* * Always use GFP_KERNEL, since copy_from_user() can sleep and * cause pagefault, which makes it pointless to use GFP_NOFS * or GFP_ATOMIC. 
*/ p = kmalloc_track_caller(len + 1, GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } p[len] = '\0'; return p; } EXPORT_SYMBOL(memdup_user_nul); /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(struct vm_area_struct *vma) { struct task_struct * __maybe_unused t = current; return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } /* * Change backing file, only valid to use during initial VMA setup. */ void vma_set_file(struct vm_area_struct *vma, struct file *file) { /* Changing an anonymous vma with this is illegal */ get_file(file); swap(vma->vm_file, file); fput(file); } EXPORT_SYMBOL(vma_set_file); #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif unsigned long randomize_stack_top(unsigned long stack_top) { unsigned long random_variable = 0; if (current->flags & PF_RANDOMIZE) { random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } #ifdef CONFIG_STACK_GROWSUP return PAGE_ALIGN(stack_top) + random_variable; #else return PAGE_ALIGN(stack_top) - random_variable; #endif } /** * randomize_page - Generate a random, page aligned address * @start: The smallest acceptable address the caller will take. * @range: The size of the area, starting at @start, within which the * random address must fall. * * If @start + @range would overflow, @range is capped. * * NOTE: Historical use of randomize_range, which this replaces, presumed that * @start was already page aligned. We now align it regardless. * * Return: A page aligned address within [start, start + range). On error, * @start is returned. 
*/
unsigned long randomize_page(unsigned long start, unsigned long range)
{
	/*
	 * Round an unaligned @start up and shrink @range by the bytes skipped.
	 * NOTE(review): if @range is smaller than the skipped bytes the
	 * subtraction wraps; presumably callers never pass such a range.
	 */
	if (!PAGE_ALIGNED(start)) {
		range -= PAGE_ALIGN(start) - start;
		start = PAGE_ALIGN(start);
	}

	/* Cap @range so that start + range cannot exceed ULONG_MAX. */
	if (start > ULONG_MAX - range)
		range = ULONG_MAX - start;

	/* From here on @range is a page count. */
	range >>= PAGE_SHIFT;

	if (range == 0)
		return start;

	return start + (get_random_long() % range << PAGE_SHIFT);
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
/*
 * Weak default: pick a random page-aligned address starting at mm->brk,
 * within 32M for 32-bit or compat tasks and within 1G otherwise.
 */
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
	/* Is the current task 32bit ? */
	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
		return randomize_page(mm->brk, SZ_32M);

	return randomize_page(mm->brk, SZ_1G);
}

/*
 * Return a random offset made of mmap_rnd_bits random bits (or
 * mmap_rnd_compat_bits for compat tasks when that option is configured),
 * shifted left by PAGE_SHIFT.
 */
unsigned long arch_mmap_rnd(void)
{
	unsigned long rnd;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
	if (is_compat_task())
		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
	else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);

	return rnd << PAGE_SHIFT;
}

/*
 * Decide whether the legacy layout applies: the ADDR_COMPAT_LAYOUT
 * personality, an unlimited stack rlimit (unless the stack grows up), or
 * otherwise the sysctl_legacy_va_layout setting.
 */
static int mmap_is_legacy(struct rlimit *rlim_stack)
{
	if (current->personality & ADDR_COMPAT_LAYOUT)
		return 1;

	/* On parisc the stack always grows up - so a unlimited stack should
	 * not be an indicator to use the legacy memory layout. */
	if (rlim_stack->rlim_cur == RLIM_INFINITY &&
		!IS_ENABLED(CONFIG_STACK_GROWSUP))
		return 1;

	return sysctl_legacy_va_layout;
}

/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 */
#define MIN_GAP		(SZ_128M)
#define MAX_GAP		(STACK_TOP / 6 * 5)

/*
 * Compute the mmap base for the non-legacy layout from the random offset
 * @rnd and the stack rlimit.
 */
static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
	/*
	 * For an upwards growing stack the calculation is much simpler.
	 * Memory for the maximum stack size is reserved at the top of the
	 * task. mmap_base starts directly below the stack and grows
	 * downwards.
	 */
	return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
#else
	unsigned long gap = rlim_stack->rlim_cur;
	unsigned long pad = stack_guard_gap;

	/* Account for stack randomization if necessary */
	if (current->flags & PF_RANDOMIZE)
		pad += (STACK_RND_MASK << PAGE_SHIFT);

	/* Values close to RLIM_INFINITY can overflow. */
	if (gap + pad > gap)
		gap += pad;

	/* Clamp the gap to [MIN_GAP, MAX_GAP]. */
	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(STACK_TOP - gap - rnd);
#endif
}

/*
 * Set mm->mmap_base and the MMF_TOPDOWN flag: the legacy layout places the
 * base at TASK_UNMAPPED_BASE plus the random factor and clears the flag,
 * otherwise mmap_base() is used and the flag is set.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	unsigned long random_factor = 0UL;

	if (current->flags & PF_RANDOMIZE)
		random_factor = arch_mmap_rnd();

	if (mmap_is_legacy(rlim_stack)) {
		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
		clear_bit(MMF_TOPDOWN, &mm->flags);
	} else {
		mm->mmap_base = mmap_base(random_factor, rlim_stack);
		set_bit(MMF_TOPDOWN, &mm->flags);
	}
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
/* Fallback: fixed base at TASK_UNMAPPED_BASE with MMF_TOPDOWN cleared. */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	mm->mmap_base = TASK_UNMAPPED_BASE;
	clear_bit(MMF_TOPDOWN, &mm->flags);
}
#endif
#ifdef CONFIG_MMU
EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout);
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 * @task:        task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.
 *
 * Return:
 * * 0       on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
*/
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
			struct task_struct *task, bool bypass_rlim)
{
	unsigned long before;
	int err = 0;

	mmap_assert_write_locked(mm);

	before = mm->locked_vm;
	if (!inc) {
		/* Warn once when removing more than is accounted; still subtract. */
		WARN_ON_ONCE(pages > before);
		mm->locked_vm = before - pages;
	} else if (!bypass_rlim &&
		   before + pages > (task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT)) {
		/* Over the limit: locked_vm is left untouched. */
		err = -ENOMEM;
	} else {
		mm->locked_vm = before + pages;
	}

	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
		 before << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
		 err ? " - exceeded" : "");

	return err;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against, may be NULL
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 *
 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
 *
 * Return:
 * * 0       on success, or if mm is NULL
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
*/ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) { int ret; if (pages == 0 || !mm) return 0; mmap_write_lock(mm); ret = __account_locked_vm(mm, pages, inc, current, capable(CAP_IPC_LOCK)); mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL_GPL(account_locked_vm); unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) { unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate, &uf); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } return ret; } unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) { if (unlikely(offset + PAGE_ALIGN(len) < offset)) return -EINVAL; if (unlikely(offset_in_page(offset))) return -EINVAL; return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); } EXPORT_SYMBOL(vm_mmap); /** * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon * failure, fall back to non-contiguous (vmalloc) allocation. * @size: size of the request. * @b: which set of kmalloc buckets to allocate from. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. * @node: numa node to allocate from * * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. 
*
 * Return: pointer to the allocated memory or %NULL in case of failure
 */
void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
{
	/* Flags for the kmalloc attempt only; @flags stays as the caller gave it. */
	gfp_t kmalloc_flags = flags;
	void *ret;

	/*
	 * We want to attempt a large physically contiguous block first because
	 * it is less likely to fragment multiple larger blocks and therefore
	 * contribute to a long term fragmentation less than vmalloc fallback.
	 * However make sure that larger requests are not too disruptive - no
	 * OOM killer and no allocation failure warnings as we have a fallback.
	 */
	if (size > PAGE_SIZE) {
		kmalloc_flags |= __GFP_NOWARN;

		if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
			kmalloc_flags |= __GFP_NORETRY;

		/* nofail semantic is implemented by the vmalloc fallback */
		kmalloc_flags &= ~__GFP_NOFAIL;
	}

	ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), kmalloc_flags, node);

	/*
	 * It doesn't really make sense to fallback to vmalloc for sub page
	 * requests
	 */
	if (ret || size <= PAGE_SIZE)
		return ret;

	/* non-sleeping allocations are not supported by vmalloc */
	if (!gfpflags_allow_blocking(flags))
		return NULL;

	/* Don't even allow crazy sizes */
	if (unlikely(size > INT_MAX)) {
		/* Warns once unless the caller passed __GFP_NOWARN. */
		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
		return NULL;
	}

	/*
	 * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
	 * since the callers already cannot assume anything
	 * about the resulting pointer, and cannot play
	 * protection games.
	 */
	return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
			flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
			node, __builtin_return_address(0));
}
EXPORT_SYMBOL(__kvmalloc_node_noprof);

/**
 * kvfree() - Free memory.
 * @addr: Pointer to allocated memory.
 *
 * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
 * It is slightly more efficient to use kfree() or vfree() if you are certain
 * that you know which one to use.
 *
 * Context: Either preemptible task context or not-NMI interrupt.
*/ void kvfree(const void *addr) { if (is_vmalloc_addr(addr)) vfree(addr); else kfree(addr); } EXPORT_SYMBOL(kvfree); /** * kvfree_sensitive - Free a data object containing sensitive information. * @addr: address of the data object to be freed. * @len: length of the data object. * * Use the special memzero_explicit() function to clear the content of a * kvmalloc'ed object containing sensitive data to make sure that the * compiler won't optimize out the data clearing. */ void kvfree_sensitive(const void *addr, size_t len) { if (likely(!ZERO_OR_NULL_PTR(addr))) { memzero_explicit((void *)addr, len); kvfree(addr); } } EXPORT_SYMBOL(kvfree_sensitive); void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) { void *newp; if (oldsize >= newsize) return (void *)p; newp = kvmalloc_noprof(newsize, flags); if (!newp) return NULL; memcpy(newp, p, oldsize); kvfree(p); return newp; } EXPORT_SYMBOL(kvrealloc_noprof); /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; return __vmalloc_noprof(bytes, flags); } EXPORT_SYMBOL(__vmalloc_array_noprof); /** * vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ void *vmalloc_array_noprof(size_t n, size_t size) { return __vmalloc_array_noprof(n, size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_array_noprof); /** * __vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). 
*/ void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) { return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO); } EXPORT_SYMBOL(__vcalloc_noprof); /** * vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ void *vcalloc_noprof(size_t n, size_t size) { return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vcalloc_noprof); struct anon_vma *folio_anon_vma(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) return NULL; return (void *)(mapping - PAGE_MAPPING_ANON); } /** * folio_mapping - Find the mapping where this folio is stored. * @folio: The folio. * * For folios which are in the page cache, return the mapping that this * page belongs to. Folios in the swap cache return the swap mapping * this page is stored in (which is different from the mapping for the * swap file or swap device where the data is stored). * * You can call this for folios which aren't in the swap cache or page * cache and it will return NULL. */ struct address_space *folio_mapping(struct folio *folio) { struct address_space *mapping; /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(folio_test_slab(folio))) return NULL; if (unlikely(folio_test_swapcache(folio))) return swap_address_space(folio->swap); mapping = folio->mapping; if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; return mapping; } EXPORT_SYMBOL(folio_mapping); /** * folio_copy - Copy the contents of one folio to another. * @dst: Folio to copy to. * @src: Folio to copy from. * * The bytes in the folio represented by @src are copied to @dst. * Assumes the caller has validated that @dst is at least as large as @src. * Can be called in atomic context for order-0 folios, but if the folio is * larger, it may sleep. 
*/
void folio_copy(struct folio *dst, struct folio *src)
{
	long i = 0;
	long nr = folio_nr_pages(src);

	/* cond_resched() runs between pages only, never after the last one. */
	for (;;) {
		copy_highpage(folio_page(dst, i), folio_page(src, i));
		if (++i == nr)
			break;
		cond_resched();
	}
}
EXPORT_SYMBOL(folio_copy);

/*
 * Like folio_copy(), but copies with copy_mc_highpage() and stops at the
 * first page for which it reports failure.
 *
 * Returns 0 on success or -EHWPOISON as soon as a page copy fails.
 */
int folio_mc_copy(struct folio *dst, struct folio *src)
{
	long nr = folio_nr_pages(src);
	long i = 0;

	for (;;) {
		if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i)))
			return -EHWPOISON;
		if (++i == nr)
			break;
		cond_resched();
	}

	return 0;
}
EXPORT_SYMBOL(folio_mc_copy);

/* Overcommit and reserve tunables with their initial values. */
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
int sysctl_overcommit_ratio __read_mostly = 50;
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

/*
 * sysctl handler for the overcommit ratio: a successful write also resets
 * sysctl_overcommit_kbytes to 0.
 */
int overcommit_ratio_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_kbytes = 0;
	return ret;
}

/* Work function: sync the vm_committed_as per-cpu counter; @dummy is unused. */
static void sync_overcommit_as(struct work_struct *dummy)
{
	percpu_counter_sync(&vm_committed_as);
}

/*
 * sysctl handler for the overcommit policy. Reads go straight to
 * proc_dointvec_minmax(). Writes are parsed into a local copy of the table
 * that points at new_policy, so sysctl_overcommit_memory is only updated
 * once the batch and per-cpu counters have been adjusted.
 */
int overcommit_policy_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	struct ctl_table t;
	int new_policy = -1;
	int ret;

	/*
	 * The deviation of sync_overcommit_as could be big with loose policy
	 * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
	 * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
	 * with the strict "NEVER", and to avoid possible race condition (even
	 * though user usually won't too frequently do the switching to policy
	 * OVERCOMMIT_NEVER), the switch is done in the following order:
	 *	1. changing the batch
	 *	2. sync percpu count on each CPU
	 *	3. switch the policy
	 */
	if (write) {
		t = *table;
		t.data = &new_policy;
		ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
		/* new_policy still being -1 means no value was stored. */
		if (ret || new_policy == -1)
			return ret;

		mm_compute_batch(new_policy);
		if (new_policy == OVERCOMMIT_NEVER)
			schedule_on_each_cpu(sync_overcommit_as);
		sysctl_overcommit_memory = new_policy;
	} else {
		ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	}

	return ret;
}

/*
 * sysctl handler for the overcommit kbytes value: a successful write also
 * resets sysctl_overcommit_ratio to 0.
 */
int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
	if (ret == 0 && write)
		sysctl_overcommit_ratio = 0;
	return ret;
}

/*
 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
 *
 * The limit, in pages, is sysctl_overcommit_kbytes when that is non-zero,
 * otherwise sysctl_overcommit_ratio percent of the RAM pages that are not
 * hugetlb pages; total_swap_pages is added in both cases.
 */
unsigned long vm_commit_limit(void)
{
	unsigned long allowed;

	if (sysctl_overcommit_kbytes)
		allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
	else
		allowed = ((totalram_pages() - hugetlb_total_pages())
			   * sysctl_overcommit_ratio / 100);
	allowed += total_swap_pages;

	return allowed;
}

/*
 * Make sure vm_committed_as in one cacheline and not cacheline shared with
 * other variables. It can be updated by several CPUs frequently.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low for small platforms, and for big
 * platform like a 2S/36C/72T Skylake server, in worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
*/
unsigned long vm_memory_committed(void)
{
	return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
	long allowed;
	unsigned long bytes_failed;

	/* Account the pages up front; the error path below undoes this. */
	vm_acct_memory(pages);

	/*
	 * Sometimes we want to use more memory than we have
	 */
	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
		return 0;

	/* Guess mode only refuses a request larger than RAM plus swap. */
	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
		if (pages > totalram_pages() + total_swap_pages)
			goto error;
		return 0;
	}

	allowed = vm_commit_limit();
	/*
	 * Reserve some for root
	 */
	if (!cap_sys_admin)
		allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

	/*
	 * Don't let a single process grow so big a user can't recover
	 */
	if (mm) {
		long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

		/* Hold back the smaller of 1/32 of the mm's total_vm and the reserve. */
		allowed -= min_t(long, mm->total_vm / 32, reserve);
	}

	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
		return 0;
error:
	bytes_failed = pages << PAGE_SHIFT;
	pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
			    __func__, current->pid, current->comm, bytes_failed);
	vm_unacct_memory(pages);

	return -ENOMEM;
}

/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task: the task whose cmdline value to copy.
 * @buffer: the buffer to copy to.
 * @buflen: the length of the buffer.
Larger cmdline values are truncated
 *          to this length.
 *
 * Return: the size of the cmdline field copied. Note that the copy does
 * not guarantee an ending NULL byte.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
	int res = 0;
	unsigned int len;
	struct mm_struct *mm = get_task_mm(task);
	unsigned long arg_start, arg_end, env_start, env_end;
	/* No mm could be obtained for the task: report 0 bytes copied. */
	if (!mm)
		goto out;
	if (!mm->arg_end)
		goto out_mm;	/* Shh! No looking before we're done */

	/* Snapshot the argument and environment bounds under arg_lock. */
	spin_lock(&mm->arg_lock);
	arg_start = mm->arg_start;
	arg_end = mm->arg_end;
	env_start = mm->env_start;
	env_end = mm->env_end;
	spin_unlock(&mm->arg_lock);

	len = arg_end - arg_start;

	/* Never ask for more than the caller's buffer can hold. */
	if (len > buflen)
		len = buflen;

	res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

	/*
	 * If the nul at the end of args has been overwritten, then
	 * assume application is using setproctitle(3).
	 */
	if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
		len = strnlen(buffer, res);
		if (len < res) {
			/* A NUL was found inside the copied bytes: stop there. */
			res = len;
		} else {
			/*
			 * No NUL in the argument area: append as much of the
			 * environment area as still fits, then cut the result
			 * at the first NUL, if any.
			 */
			len = env_end - env_start;
			if (len > buflen - res)
				len = buflen - res;
			res += access_process_vm(task, env_start,
						 buffer+res, len,
						 FOLL_FORCE);
			res = strnlen(buffer, res);
		}
	}
out_mm:
	mmput(mm);
out:
	return res;
}

/*
 * Compare the contents of two pages with memcmp() over PAGE_SIZE bytes and
 * return its result.  Both pages are mapped with kmap_local_page() for the
 * comparison and unmapped in the reverse order of mapping.
 */
int __weak memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_local_page(page1);
	addr2 = kmap_local_page(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_local(addr2);
	kunmap_local(addr1);
	return ret;
}

#ifdef CONFIG_PRINTK
/**
 * mem_dump_obj - Print available provenance information
 * @object: object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate.  The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For example, for a slab-cache object, the slab name is printed, and,
 * if available, the return address and stack trace from the allocation
 * and last free path of that object.
*/
void mem_dump_obj(void *object)
{
	const char *type;

	/* Nothing more to print once either dump helper reports success. */
	if (kmem_dump_obj(object))
		return;

	if (vmalloc_dump_obj(object))
		return;

	/* Neither helper handled it: fall back to a coarse classification. */
	if (is_vmalloc_addr(object))
		type = "vmalloc memory";
	else if (virt_addr_valid(object))
		type = "non-slab/vmalloc memory";
	else if (object == NULL)
		type = "NULL pointer";
	else if (object == ZERO_SIZE_PTR)
		type = "zero-size pointer";
	else
		type = "non-paged memory";

	pr_cont(" %s\n", type);
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif

/*
 * A driver might set a page logically offline -- PageOffline() -- and
 * turn the page inaccessible in the hypervisor; after that, access to page
 * content can be fatal.
 *
 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
 * pages after checking PageOffline(); however, these PFN walkers can race
 * with drivers that set PageOffline().
 *
 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 * synchronize with such drivers, achieving that a page cannot be set
 * PageOffline() while frozen.
 *
 * page_offline_begin()/page_offline_end() is used by drivers that care about
 * such races when setting a page PageOffline().
 */
static DECLARE_RWSEM(page_offline_rwsem);

/* Take page_offline_rwsem for reading. */
void page_offline_freeze(void)
{
	down_read(&page_offline_rwsem);
}

/* Drop the read side taken by page_offline_freeze(). */
void page_offline_thaw(void)
{
	up_read(&page_offline_rwsem);
}

/* Take page_offline_rwsem for writing. */
void page_offline_begin(void)
{
	down_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_begin);

/* Drop the write side taken by page_offline_begin(). */
void page_offline_end(void)
{
	up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);

#ifndef flush_dcache_folio
/* Fallback: call flush_dcache_page() on each page of the folio in turn. */
void flush_dcache_folio(struct folio *folio)
{
	long i, nr = folio_nr_pages(folio);

	for (i = 0; i < nr; i++)
		flush_dcache_page(folio_page(folio, i));
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif
|
| 3 214 3 1 2 214 103 213 32 32 32 32 32 214 103 142 36 35 36 89 53 36 36 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. 
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"

/* Cursor cache passed to xfs_btree_alloc_cursor() by both cursor constructors. */
static struct kmem_cache	*xfs_inobt_cur_cache;

/* Minimum records per block: index 0 for leaf level, index 1 for any other level. */
STATIC int
xfs_inobt_get_minrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return M_IGEO(cur->bc_mp)->inobt_mnr[level != 0];
}

/* Build a new inobt cursor from the perag, transaction and AGI buffer of @cur. */
STATIC struct xfs_btree_cur *
xfs_inobt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
			cur->bc_ag.agbp);
}

/* Build a new finobt cursor from the perag, transaction and AGI buffer of @cur. */
STATIC struct xfs_btree_cur *
xfs_finobt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	return xfs_finobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
			cur->bc_ag.agbp);
}

/* Store a new inobt root in the AGI, adjust the level by @inc and log both fields. */
STATIC void
xfs_inobt_set_root(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*nptr,
	int				inc)	/* level change */
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agi		*agi = agbp->b_addr;

	agi->agi_root = nptr->s;
	be32_add_cpu(&agi->agi_level, inc);
	xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
}

/* Same as xfs_inobt_set_root(), but for the free inode btree fields of the AGI. */
STATIC void
xfs_finobt_set_root(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*nptr,
	int				inc)	/* level change */
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agi		*agi = agbp->b_addr;

	agi->agi_free_root = nptr->s;
	be32_add_cpu(&agi->agi_free_level, inc);
	xfs_ialloc_log_agi(cur->bc_tp, agbp,
			   XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
}

/* Update the inode btree block counter for this btree.
*/
static inline void
xfs_inobt_mod_blockcount(
	struct xfs_btree_cur	*cur,
	int			howmuch)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agi		*agi = agbp->b_addr;

	/* Nothing to maintain when the inobtcounts feature is off. */
	if (!xfs_has_inobtcounts(cur->bc_mp))
		return;

	if (xfs_btree_is_fino(cur->bc_ops))
		be32_add_cpu(&agi->agi_fblocks, howmuch);
	else
		be32_add_cpu(&agi->agi_iblocks, howmuch);
	/*
	 * NOTE(review): XFS_AGI_IBLOCKS is logged for either counter;
	 * presumably the flag covers agi_fblocks as well -- confirm.
	 */
	xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS);
}

/*
 * Allocate a single block for the btree near the block named by @start,
 * charging it to reservation type @resv.  On success *stat is 1 and @new
 * holds the new block; *stat is 0 when no block could be found.  Returns 0
 * or the error from the allocator.
 */
STATIC int
__xfs_inobt_alloc_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*start,
	union xfs_btree_ptr		*new,
	int				*stat,
	enum xfs_ag_resv_type		resv)
{
	xfs_alloc_arg_t		args;		/* block allocation args */
	int			error;		/* error return value */
	xfs_agblock_t		sbno = be32_to_cpu(start->s);

	memset(&args, 0, sizeof(args));
	args.tp = cur->bc_tp;
	args.mp = cur->bc_mp;
	args.pag = cur->bc_ag.pag;
	args.oinfo = XFS_RMAP_OINFO_INOBT;
	/* exactly one block is requested */
	args.minlen = 1;
	args.maxlen = 1;
	args.prod = 1;
	args.resv = resv;

	error = xfs_alloc_vextent_near_bno(&args,
			XFS_AGB_TO_FSB(args.mp, args.pag->pag_agno, sbno));
	if (error)
		return error;

	/* The allocator returned without error but found no block. */
	if (args.fsbno == NULLFSBLOCK) {
		*stat = 0;
		return 0;
	}
	ASSERT(args.len == 1);

	new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
	*stat = 1;
	xfs_inobt_mod_blockcount(cur, 1);
	return 0;
}

/* inobt block allocation: no AG reservation is used. */
STATIC int
xfs_inobt_alloc_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*start,
	union xfs_btree_ptr		*new,
	int				*stat)
{
	return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
}

/*
 * finobt block allocation: use the metadata reservation unless the mount has
 * m_finobt_nores set, in which case behave like the inobt.
 */
STATIC int
xfs_finobt_alloc_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*start,
	union xfs_btree_ptr		*new,
	int				*stat)
{
	if (cur->bc_mp->m_finobt_nores)
		return xfs_inobt_alloc_block(cur, start, new, stat);
	return __xfs_inobt_alloc_block(cur, start, new, stat,
			XFS_AG_RESV_METADATA);
}

/*
 * Drop one block from the btree's block counter and queue the block backing
 * @bp for freeing against reservation type @resv.
 */
STATIC int
__xfs_inobt_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	enum xfs_ag_resv_type	resv)
{
	xfs_fsblock_t		fsbno;

	xfs_inobt_mod_blockcount(cur, -1);
	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
	return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
			&XFS_RMAP_OINFO_INOBT, resv, 0);
}

/* inobt block free: no AG reservation is used. */
STATIC int
xfs_inobt_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_NONE);
}

/* finobt block free: mirrors the reservation choice of xfs_finobt_alloc_block(). */
STATIC int
xfs_finobt_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	if (cur->bc_mp->m_finobt_nores)
		return xfs_inobt_free_block(cur, bp);
	return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
}

/* Maximum records per block: index 0 for leaf level, index 1 for any other level. */
STATIC int
xfs_inobt_get_maxrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return M_IGEO(cur->bc_mp)->inobt_mxr[level != 0];
}

/* The low key of a record is its starting inode number. */
STATIC void
xfs_inobt_init_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	key->inobt.ir_startino = rec->inobt.ir_startino;
}

/* The high key is the last inode number covered by the record's chunk. */
STATIC void
xfs_inobt_init_high_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	__u32				x;

	x = be32_to_cpu(rec->inobt.ir_startino);
	x += XFS_INODES_PER_CHUNK - 1;
	key->inobt.ir_startino = cpu_to_be32(x);
}

/* Convert the in-core record held in the cursor to its on-disk form. */
STATIC void
xfs_inobt_init_rec_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec)
{
	rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
	if (xfs_has_sparseinodes(cur->bc_mp)) {
		rec->inobt.ir_u.sp.ir_holemask =
					cpu_to_be16(cur->bc_rec.i.ir_holemask);
		/* these two are copied without a byte swap */
		rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
		rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount;
	} else {
		/* ir_holemask/ir_count not supported on-disk */
		rec->inobt.ir_u.f.ir_freecount =
					cpu_to_be32(cur->bc_rec.i.ir_freecount);
	}
	rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
}

/*
 * initial value of ptr for lookup
 */
STATIC void
xfs_inobt_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xfs_agi		*agi = cur->bc_ag.agbp->b_addr;

	ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));

	ptr->s = agi->agi_root;
}

/* As above, but start from the free inode btree root. */
STATIC void
xfs_finobt_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xfs_agi		*agi = cur->bc_ag.agbp->b_addr;

	ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno));
	ptr->s = agi->agi_free_root;
}
/* Signed difference between @key's start inode and the cursor record's. */
STATIC int64_t
xfs_inobt_key_diff(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key)
{
	return (int64_t)be32_to_cpu(key->inobt.ir_startino) -
			  cur->bc_rec.i.ir_startino;
}

/* Signed difference between the start inodes of @k1 and @k2. */
STATIC int64_t
xfs_inobt_diff_two_keys(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2,
	const union xfs_btree_key	*mask)
{
	/* a mask, when supplied, must select ir_startino */
	ASSERT(!mask || mask->inobt.ir_startino);

	return (int64_t)be32_to_cpu(k1->inobt.ir_startino) -
			be32_to_cpu(k2->inobt.ir_startino);
}

/*
 * Structural check shared by both inode btrees.  Returns NULL-equivalent
 * success from xfs_btree_agblock_verify() or the address of the failing check.
 */
static xfs_failaddr_t
xfs_inobt_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	xfs_failaddr_t		fa;
	unsigned int		level;

	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	/*
	 * During growfs operations, we can't verify the exact owner as the
	 * perag is not fully initialised and hence not attached to the buffer.
	 *
	 * Similarly, during log recovery we will have a perag structure
	 * attached, but the agi information will not yet have been initialised
	 * from the on disk AGI. We don't currently use any of this information,
	 * but beware of the landmine (i.e. need to check
	 * xfs_perag_initialised_agi(pag)) if we ever do.
	 */
	if (xfs_has_crc(mp)) {
		fa = xfs_btree_agblock_v5hdr_verify(bp);
		if (fa)
			return fa;
	}

	/* level verification */
	level = be16_to_cpu(block->bb_level);
	if (level >= M_IGEO(mp)->inobt_maxlevels)
		return __this_address;

	return xfs_btree_agblock_verify(bp,
			M_IGEO(mp)->inobt_mxr[level != 0]);
}

/* Read verifier: CRC first, then structure; trace if the buffer ended up in error. */
static void
xfs_inobt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	if (!xfs_btree_agblock_verify_crc(bp))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_inobt_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}

	if (bp->b_error)
		trace_xfs_btree_corrupt(bp, _RET_IP_);
}

/* Write verifier: refuse structurally bad blocks, otherwise compute the CRC. */
static void
xfs_inobt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	fa = xfs_inobt_verify(bp);
	if (fa) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}
	xfs_btree_agblock_calc_crc(bp);

}

/* The two buffer ops differ only in name and accepted magic numbers. */
const struct xfs_buf_ops xfs_inobt_buf_ops = {
	.name = "xfs_inobt",
	.magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) },
	.verify_read = xfs_inobt_read_verify,
	.verify_write = xfs_inobt_write_verify,
	.verify_struct = xfs_inobt_verify,
};

const struct xfs_buf_ops xfs_finobt_buf_ops = {
	.name = "xfs_finobt",
	.magic = { cpu_to_be32(XFS_FIBT_MAGIC),
		   cpu_to_be32(XFS_FIBT_CRC_MAGIC) },
	.verify_read = xfs_inobt_read_verify,
	.verify_write = xfs_inobt_write_verify,
	.verify_struct = xfs_inobt_verify,
};

/* Keys are in order when @k1 starts strictly before @k2. */
STATIC int
xfs_inobt_keys_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2)
{
	return be32_to_cpu(k1->inobt.ir_startino) <
		be32_to_cpu(k2->inobt.ir_startino);
}

/* Records are in order when @r2 starts at or after the end of @r1's chunk. */
STATIC int
xfs_inobt_recs_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*r1,
	const union xfs_btree_rec	*r2)
{
	return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
		be32_to_cpu(r2->inobt.ir_startino);
}

/* Classify the adjacency of two keys by their start inode numbers. */
STATIC enum xbtree_key_contig
xfs_inobt_keys_contiguous(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key1,
	const union xfs_btree_key	*key2,
	const union xfs_btree_key	*mask)
{
	ASSERT(!mask || mask->inobt.ir_startino);

	return xbtree_key_contig(be32_to_cpu(key1->inobt.ir_startino),
				 be32_to_cpu(key2->inobt.ir_startino));
}

/* Operations for the inode btree. */
const struct xfs_btree_ops xfs_inobt_ops = {
	.name			= "ino",
	.type			= XFS_BTREE_TYPE_AG,

	.rec_len		= sizeof(xfs_inobt_rec_t),
	.key_len		= sizeof(xfs_inobt_key_t),
	.ptr_len		= XFS_BTREE_SHORT_PTR_LEN,

	.lru_refs		= XFS_INO_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_ibt_2),
	.sick_mask		= XFS_SICK_AG_INOBT,

	.dup_cursor		= xfs_inobt_dup_cursor,
	.set_root		= xfs_inobt_set_root,
	.alloc_block		= xfs_inobt_alloc_block,
	.free_block		= xfs_inobt_free_block,
	.get_minrecs		= xfs_inobt_get_minrecs,
	.get_maxrecs		= xfs_inobt_get_maxrecs,
	.init_key_from_rec	= xfs_inobt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_inobt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_inobt_init_ptr_from_cur,
	.key_diff		= xfs_inobt_key_diff,
	.buf_ops		= &xfs_inobt_buf_ops,
	.diff_two_keys		= xfs_inobt_diff_two_keys,
	.keys_inorder		= xfs_inobt_keys_inorder,
	.recs_inorder		= xfs_inobt_recs_inorder,
	.keys_contiguous	= xfs_inobt_keys_contiguous,
};

/*
 * Operations for the free inode btree.  Only the root, block alloc/free,
 * cursor dup, initial pointer, buffer ops, name, stats and sick mask differ
 * from the inobt table above.
 */
const struct xfs_btree_ops xfs_finobt_ops = {
	.name			= "fino",
	.type			= XFS_BTREE_TYPE_AG,

	.rec_len		= sizeof(xfs_inobt_rec_t),
	.key_len		= sizeof(xfs_inobt_key_t),
	.ptr_len		= XFS_BTREE_SHORT_PTR_LEN,

	.lru_refs		= XFS_INO_BTREE_REF,
	.statoff		= XFS_STATS_CALC_INDEX(xs_fibt_2),
	.sick_mask		= XFS_SICK_AG_FINOBT,

	.dup_cursor		= xfs_finobt_dup_cursor,
	.set_root		= xfs_finobt_set_root,
	.alloc_block		= xfs_finobt_alloc_block,
	.free_block		= xfs_finobt_free_block,
	.get_minrecs		= xfs_inobt_get_minrecs,
	.get_maxrecs		= xfs_inobt_get_maxrecs,
	.init_key_from_rec	= xfs_inobt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_inobt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_finobt_init_ptr_from_cur,
	.key_diff		= xfs_inobt_key_diff,
	.buf_ops		= &xfs_finobt_buf_ops,
	.diff_two_keys		= xfs_inobt_diff_two_keys,
	.keys_inorder		= xfs_inobt_keys_inorder,
	.recs_inorder		= xfs_inobt_recs_inorder,
	.keys_contiguous	= xfs_inobt_keys_contiguous,
};

/*
 * Create an inode btree cursor.
 *
 * For staging cursors tp and agbp are NULL.
 */
struct xfs_btree_cur *
xfs_inobt_init_cursor(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp)
{
	struct xfs_mount	*mp = pag->pag_mount;
	struct xfs_btree_cur	*cur;

	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_inobt_ops,
			M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
	/* the cursor takes its own hold on the perag */
	cur->bc_ag.pag = xfs_perag_hold(pag);
	cur->bc_ag.agbp = agbp;
	if (agbp) {
		struct xfs_agi		*agi = agbp->b_addr;

		/* tree height comes from the AGI when one is supplied */
		cur->bc_nlevels = be32_to_cpu(agi->agi_level);
	}
	return cur;
}

/*
 * Create a free inode btree cursor.
 *
 * For staging cursors tp and agbp are NULL.
 */
struct xfs_btree_cur *
xfs_finobt_init_cursor(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp)
{
	struct xfs_mount	*mp = pag->pag_mount;
	struct xfs_btree_cur	*cur;

	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_finobt_ops,
			M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
	cur->bc_ag.pag = xfs_perag_hold(pag);
	cur->bc_ag.agbp = agbp;
	if (agbp) {
		struct xfs_agi		*agi = agbp->b_addr;

		cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
	}
	return cur;
}

/*
 * Install a new inobt btree root.  Caller is responsible for invalidating
 * and freeing the old btree blocks.
*/ void xfs_inobt_commit_staged_btree( struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp) { struct xfs_agi *agi = agbp->b_addr; struct xbtree_afakeroot *afake = cur->bc_ag.afake; int fields; ASSERT(cur->bc_flags & XFS_BTREE_STAGING); if (xfs_btree_is_ino(cur->bc_ops)) { fields = XFS_AGI_ROOT | XFS_AGI_LEVEL; agi->agi_root = cpu_to_be32(afake->af_root); agi->agi_level = cpu_to_be32(afake->af_levels); if (xfs_has_inobtcounts(cur->bc_mp)) { agi->agi_iblocks = cpu_to_be32(afake->af_blocks); fields |= XFS_AGI_IBLOCKS; } xfs_ialloc_log_agi(tp, agbp, fields); xfs_btree_commit_afakeroot(cur, tp, agbp); } else { fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL; agi->agi_free_root = cpu_to_be32(afake->af_root); agi->agi_free_level = cpu_to_be32(afake->af_levels); if (xfs_has_inobtcounts(cur->bc_mp)) { agi->agi_fblocks = cpu_to_be32(afake->af_blocks); fields |= XFS_AGI_IBLOCKS; } xfs_ialloc_log_agi(tp, agbp, fields); xfs_btree_commit_afakeroot(cur, tp, agbp); } } /* Calculate number of records in an inode btree block. */ static inline unsigned int xfs_inobt_block_maxrecs( unsigned int blocklen, bool leaf) { if (leaf) return blocklen / sizeof(xfs_inobt_rec_t); return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); } /* * Calculate number of records in an inobt btree block. */ int xfs_inobt_maxrecs( struct xfs_mount *mp, int blocklen, int leaf) { blocklen -= XFS_INOBT_BLOCK_LEN(mp); return xfs_inobt_block_maxrecs(blocklen, leaf); } /* * Maximum number of inode btree records per AG. Pretend that we can fill an * entire AG completely full of inodes except for the AG headers. */ #define XFS_MAX_INODE_RECORDS \ ((XFS_MAX_AG_BYTES - (4 * BBSIZE)) / XFS_DINODE_MIN_SIZE) / \ XFS_INODES_PER_CHUNK /* Compute the max possible height for the inode btree. 
*/ static inline unsigned int xfs_inobt_maxlevels_ondisk(void) { unsigned int minrecs[2]; unsigned int blocklen; blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN, XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN); minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2; minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2; return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS); } /* Compute the max possible height for the free inode btree. */ static inline unsigned int xfs_finobt_maxlevels_ondisk(void) { unsigned int minrecs[2]; unsigned int blocklen; blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN; minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2; minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2; return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS); } /* Compute the max possible height for either inode btree. */ unsigned int xfs_iallocbt_maxlevels_ondisk(void) { return max(xfs_inobt_maxlevels_ondisk(), xfs_finobt_maxlevels_ondisk()); } /* * Convert the inode record holemask to an inode allocation bitmap. The inode * allocation bitmap is inode granularity and specifies whether an inode is * physically allocated on disk (not whether the inode is considered allocated * or free by the fs). * * A bit value of 1 means the inode is allocated, a value of 0 means it is free. */ uint64_t xfs_inobt_irec_to_allocmask( const struct xfs_inobt_rec_incore *rec) { uint64_t bitmap = 0; uint64_t inodespbit; int nextbit; uint allocbitmap; /* * The holemask has 16-bits for a 64 inode record. Therefore each * holemask bit represents multiple inodes. Create a mask of bits to set * in the allocmask for each holemask bit. */ inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1; /* * Allocated inodes are represented by 0 bits in holemask. Invert the 0 * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask * anything beyond the 16 holemask bits since this casts to a larger * type. 
*/ allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1); /* * allocbitmap is the inverted holemask so every set bit represents * allocated inodes. To expand from 16-bit holemask granularity to * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target * bitmap for every holemask bit. */ nextbit = xfs_next_bit(&allocbitmap, 1, 0); while (nextbit != -1) { ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY)); bitmap |= (inodespbit << (nextbit * XFS_INODES_PER_HOLEMASK_BIT)); nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1); } return bitmap; } #if defined(DEBUG) || defined(XFS_WARN) /* * Verify that an in-core inode record has a valid inode count. */ int xfs_inobt_rec_check_count( struct xfs_mount *mp, struct xfs_inobt_rec_incore *rec) { int inocount = 0; int nextbit = 0; uint64_t allocbmap; int wordsz; wordsz = sizeof(allocbmap) / sizeof(unsigned int); allocbmap = xfs_inobt_irec_to_allocmask(rec); nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit); while (nextbit != -1) { inocount++; nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit + 1); } if (inocount != rec->ir_count) return -EFSCORRUPTED; return 0; } #endif /* DEBUG */ static xfs_extlen_t xfs_inobt_max_size( struct xfs_perag *pag) { struct xfs_mount *mp = pag->pag_mount; xfs_agblock_t agblocks = pag->block_count; /* Bail out if we're uninitialized, which can happen in mkfs. */ if (M_IGEO(mp)->inobt_mxr[0] == 0) return 0; /* * The log is permanently allocated, so the space it occupies will * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. 
*/ if (xfs_ag_contains_log(mp, pag->pag_agno)) agblocks -= mp->m_sb.sb_logblocks; return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, (uint64_t)agblocks * mp->m_sb.sb_inopblock / XFS_INODES_PER_CHUNK); } static int xfs_finobt_count_blocks( struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t *tree_blocks) { struct xfs_buf *agbp = NULL; struct xfs_btree_cur *cur; int error; error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) return error; cur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_btree_count_blocks(cur, tree_blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); return error; } /* Read finobt block count from AGI header. */ static int xfs_finobt_read_blocks( struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t *tree_blocks) { struct xfs_buf *agbp; struct xfs_agi *agi; int error; error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) return error; agi = agbp->b_addr; *tree_blocks = be32_to_cpu(agi->agi_fblocks); xfs_trans_brelse(tp, agbp); return 0; } /* * Figure out how many blocks to reserve and how many are used by this btree. */ int xfs_finobt_calc_reserves( struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t *ask, xfs_extlen_t *used) { xfs_extlen_t tree_len = 0; int error; if (!xfs_has_finobt(pag->pag_mount)) return 0; if (xfs_has_inobtcounts(pag->pag_mount)) error = xfs_finobt_read_blocks(pag, tp, &tree_len); else error = xfs_finobt_count_blocks(pag, tp, &tree_len); if (error) return error; *ask += xfs_inobt_max_size(pag); *used += tree_len; return 0; } /* Calculate the inobt btree size for some records. 
*/
xfs_extlen_t
xfs_iallocbt_calc_size(
	struct xfs_mount	*mp,
	unsigned long long	len)
{
	/* size the tree from the inode btree minimum-records table */
	return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, len);
}

/*
 * Create the cursor cache used by both inode btree cursor constructors.
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
int __init
xfs_inobt_init_cur_cache(void)
{
	struct kmem_cache	*cache;

	cache = kmem_cache_create("xfs_inobt_cur",
			xfs_btree_cur_sizeof(xfs_inobt_maxlevels_ondisk()),
			0, 0, NULL);
	xfs_inobt_cur_cache = cache;

	return cache ? 0 : -ENOMEM;
}

/* Destroy the cursor cache and clear the pointer to it. */
void
xfs_inobt_destroy_cur_cache(void)
{
	kmem_cache_destroy(xfs_inobt_cur_cache);
	xfs_inobt_cur_cache = NULL;
}
|
| 345 343 346 7 7 6 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic Timer-queue * * Manages a simple queue of timers, ordered by expiration time. * Uses rbtrees for quick list adds and expiration. * * NOTE: All of the following functions need to be serialized * to avoid races. No locking is done by this library code. */ #include <linux/bug.h> #include <linux/timerqueue.h> #include <linux/rbtree.h> #include <linux/export.h> #define __node_2_tq(_n) \ rb_entry((_n), struct timerqueue_node, node) static inline bool __timerqueue_less(struct rb_node *a, const struct rb_node *b) { return __node_2_tq(a)->expires < __node_2_tq(b)->expires; } /** * timerqueue_add - Adds timer to timerqueue. * * @head: head of timerqueue * @node: timer node to be added * * Adds the timer node to the timerqueue, sorted by the node's expires * value. Returns true if the newly added timer is the first expiring timer in * the queue. */ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) { /* Make sure we don't add nodes that are already added */ WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); return rb_add_cached(&node->node, &head->rb_root, __timerqueue_less); } EXPORT_SYMBOL_GPL(timerqueue_add); /** * timerqueue_del - Removes a timer from the timerqueue. * * @head: head of timerqueue * @node: timer node to be removed * * Removes the timer node from the timerqueue. Returns true if the queue is * not empty after the remove. 
*/ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) { WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); rb_erase_cached(&node->node, &head->rb_root); RB_CLEAR_NODE(&node->node); return !RB_EMPTY_ROOT(&head->rb_root.rb_root); } EXPORT_SYMBOL_GPL(timerqueue_del); /** * timerqueue_iterate_next - Returns the timer after the provided timer * * @node: Pointer to a timer. * * Provides the timer that is after the given node. This is used, when * necessary, to iterate through the list of timers in a timer list * without modifying the list. */ struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) { struct rb_node *next; if (!node) return NULL; next = rb_next(&node->node); if (!next) return NULL; return container_of(next, struct timerqueue_node, node); } EXPORT_SYMBOL_GPL(timerqueue_iterate_next); |
| 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM dlm #if !defined(_TRACE_DLM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_DLM_H #include <linux/dlm.h> #include <linux/dlmconstants.h> #include <uapi/linux/dlm_plock.h> #include <linux/tracepoint.h> #include "../../../fs/dlm/dlm_internal.h" #define show_lock_flags(flags) __print_flags(flags, "|", \ { DLM_LKF_NOQUEUE, "NOQUEUE" }, \ { DLM_LKF_CANCEL, "CANCEL" }, \ { DLM_LKF_CONVERT, "CONVERT" }, \ { DLM_LKF_VALBLK, "VALBLK" }, \ { DLM_LKF_QUECVT, "QUECVT" }, \ { DLM_LKF_IVVALBLK, "IVVALBLK" }, \ { DLM_LKF_CONVDEADLK, "CONVDEADLK" }, \ { DLM_LKF_PERSISTENT, "PERSISTENT" }, \ { DLM_LKF_NODLCKWT, "NODLCKWT" }, \ { DLM_LKF_NODLCKBLK, "NODLCKBLK" }, \ { DLM_LKF_EXPEDITE, "EXPEDITE" }, \ { DLM_LKF_NOQUEUEBAST, "NOQUEUEBAST" }, \ { DLM_LKF_HEADQUE, "HEADQUE" }, \ { DLM_LKF_NOORDER, "NOORDER" }, \ { DLM_LKF_ORPHAN, "ORPHAN" }, \ { DLM_LKF_ALTPR, "ALTPR" }, \ { DLM_LKF_ALTCW, "ALTCW" }, \ { DLM_LKF_FORCEUNLOCK, "FORCEUNLOCK" }, \ { DLM_LKF_TIMEOUT, "TIMEOUT" }) #define show_lock_mode(mode) __print_symbolic(mode, \ { DLM_LOCK_IV, "IV"}, \ { DLM_LOCK_NL, "NL"}, \ { DLM_LOCK_CR, "CR"}, \ { DLM_LOCK_CW, "CW"}, \ { DLM_LOCK_PR, "PR"}, \ { DLM_LOCK_PW, "PW"}, \ { DLM_LOCK_EX, "EX"}) #define show_dlm_sb_flags(flags) __print_flags(flags, "|", \ { DLM_SBF_DEMOTED, "DEMOTED" }, \ { DLM_SBF_VALNOTVALID, 
"VALNOTVALID" }, \ { DLM_SBF_ALTMODE, "ALTMODE" }) #define show_lkb_flags(flags) __print_flags(flags, "|", \ { BIT(DLM_DFL_USER_BIT), "USER" }, \ { BIT(DLM_DFL_ORPHAN_BIT), "ORPHAN" }) #define show_header_cmd(cmd) __print_symbolic(cmd, \ { DLM_MSG, "MSG"}, \ { DLM_RCOM, "RCOM"}, \ { DLM_OPTS, "OPTS"}, \ { DLM_ACK, "ACK"}, \ { DLM_FIN, "FIN"}) #define show_message_version(version) __print_symbolic(version, \ { DLM_VERSION_3_1, "3.1"}, \ { DLM_VERSION_3_2, "3.2"}) #define show_message_type(type) __print_symbolic(type, \ { DLM_MSG_REQUEST, "REQUEST"}, \ { DLM_MSG_CONVERT, "CONVERT"}, \ { DLM_MSG_UNLOCK, "UNLOCK"}, \ { DLM_MSG_CANCEL, "CANCEL"}, \ { DLM_MSG_REQUEST_REPLY, "REQUEST_REPLY"}, \ { DLM_MSG_CONVERT_REPLY, "CONVERT_REPLY"}, \ { DLM_MSG_UNLOCK_REPLY, "UNLOCK_REPLY"}, \ { DLM_MSG_CANCEL_REPLY, "CANCEL_REPLY"}, \ { DLM_MSG_GRANT, "GRANT"}, \ { DLM_MSG_BAST, "BAST"}, \ { DLM_MSG_LOOKUP, "LOOKUP"}, \ { DLM_MSG_REMOVE, "REMOVE"}, \ { DLM_MSG_LOOKUP_REPLY, "LOOKUP_REPLY"}, \ { DLM_MSG_PURGE, "PURGE"}) #define show_rcom_type(type) __print_symbolic(type, \ { DLM_RCOM_STATUS, "STATUS"}, \ { DLM_RCOM_NAMES, "NAMES"}, \ { DLM_RCOM_LOOKUP, "LOOKUP"}, \ { DLM_RCOM_LOCK, "LOCK"}, \ { DLM_RCOM_STATUS_REPLY, "STATUS_REPLY"}, \ { DLM_RCOM_NAMES_REPLY, "NAMES_REPLY"}, \ { DLM_RCOM_LOOKUP_REPLY, "LOOKUP_REPLY"}, \ { DLM_RCOM_LOCK_REPLY, "LOCK_REPLY"}) /* note: we begin tracing dlm_lock_start() only if ls and lkb are found */ TRACE_EVENT(dlm_lock_start, TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, const void *name, unsigned int namelen, int mode, __u32 flags), TP_ARGS(ls, lkb, name, namelen, mode, flags), TP_STRUCT__entry( __field(__u32, ls_id) __field(__u32, lkb_id) __field(int, mode) __field(__u32, flags) __dynamic_array(unsigned char, res_name, lkb->lkb_resource ? 
lkb->lkb_resource->res_length : namelen) ), TP_fast_assign( struct dlm_rsb *r; __entry->ls_id = ls->ls_global_id; __entry->lkb_id = lkb->lkb_id; __entry->mode = mode; __entry->flags = flags; r = lkb->lkb_resource; if (r) memcpy(__get_dynamic_array(res_name), r->res_name, __get_dynamic_array_len(res_name)); else if (name) memcpy(__get_dynamic_array(res_name), name, __get_dynamic_array_len(res_name)); ), TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s res_name=%s", __entry->ls_id, __entry->lkb_id, show_lock_mode(__entry->mode), show_lock_flags(__entry->flags), __print_hex_str(__get_dynamic_array(res_name), __get_dynamic_array_len(res_name))) ); TRACE_EVENT(dlm_lock_end, TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, const void *name, unsigned int namelen, int mode, __u32 flags, int error, bool kernel_lock), TP_ARGS(ls, lkb, name, namelen, mode, flags, error, kernel_lock), TP_STRUCT__entry( __field(__u32, ls_id) __field(__u32, lkb_id) __field(int, mode) __field(__u32, flags) __field(int, error) __dynamic_array(unsigned char, res_name, lkb->lkb_resource ? lkb->lkb_resource->res_length : namelen) ), TP_fast_assign( struct dlm_rsb *r; __entry->ls_id = ls->ls_global_id; __entry->lkb_id = lkb->lkb_id; __entry->mode = mode; __entry->flags = flags; __entry->error = error; r = lkb->lkb_resource; if (r) memcpy(__get_dynamic_array(res_name), r->res_name, __get_dynamic_array_len(res_name)); else if (name) memcpy(__get_dynamic_array(res_name), name, __get_dynamic_array_len(res_name)); if (kernel_lock) { /* return value will be zeroed in those cases by dlm_lock() * we do it here again to not introduce more overhead if * trace isn't running and error reflects the return value. 
*/ if (error == -EAGAIN || error == -EDEADLK) __entry->error = 0; } ), TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s error=%d res_name=%s", __entry->ls_id, __entry->lkb_id, show_lock_mode(__entry->mode), show_lock_flags(__entry->flags), __entry->error, __print_hex_str(__get_dynamic_array(res_name), __get_dynamic_array_len(res_name))) ); TRACE_EVENT(dlm_bast, TP_PROTO(__u32 ls_id, __u32 lkb_id, int mode, const char *res_name, size_t res_length), TP_ARGS(ls_id, lkb_id, mode, res_name, res_length), TP_STRUCT__entry( __field(__u32, ls_id) __field(__u32, lkb_id) __field(int, mode) __dynamic_array(unsigned char, res_name, res_length) ), TP_fast_assign( __entry->ls_id = ls_id; __entry->lkb_id = lkb_id; __entry->mode = mode; memcpy(__get_dynamic_array(res_name), res_name, __get_dynamic_array_len(res_name)); ), TP_printk("ls_id=%u lkb_id=%x mode=%s res_name=%s", __entry->ls_id, __entry->lkb_id, show_lock_mode(__entry->mode), __print_hex_str(__get_dynamic_array(res_name), __get_dynamic_array_len(res_name))) ); TRACE_EVENT(dlm_ast, TP_PROTO(__u32 ls_id, __u32 lkb_id, __u8 sb_flags, int sb_status, const char *res_name, size_t res_length), TP_ARGS(ls_id, lkb_id, sb_flags, sb_status, res_name, res_length), TP_STRUCT__entry( __field(__u32, ls_id) __field(__u32, lkb_id) __field(__u8, sb_flags) __field(int, sb_status) __dynamic_array(unsigned char, res_name, res_length) ), TP_fast_assign( __entry->ls_id = ls_id; __entry->lkb_id = lkb_id; __entry->sb_flags = sb_flags; __entry->sb_status = sb_status; memcpy(__get_dynamic_array(res_name), res_name, __get_dynamic_array_len(res_name)); ), TP_printk("ls_id=%u lkb_id=%x sb_flags=%s sb_status=%d res_name=%s", __entry->ls_id, __entry->lkb_id, show_dlm_sb_flags(__entry->sb_flags), __entry->sb_status, __print_hex_str(__get_dynamic_array(res_name), __get_dynamic_array_len(res_name))) ); /* note: we begin tracing dlm_unlock_start() only if ls and lkb are found */ TRACE_EVENT(dlm_unlock_start, TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, 
__u32 flags), TP_ARGS(ls, lkb, flags), TP_STRUCT__entry( __field(__u32, ls_id) __field(__u32, lkb_id) __field(__u32, flags) __dynamic_array(unsigned char, res_name, lkb->lkb_resource ? lkb->lkb_resource->res_length : 0) ), TP_fast_assign( struct dlm_rsb *r; __entry->ls_id = ls->ls_global_id; __entry->lkb_id = lkb->lkb_id; __entry->flags = flags; r = lkb->lkb_resource; if (r) memcpy(__get_dynamic_array(res_name), r->res_name, __get_dynamic_array_len(res_name)); ), TP_printk("ls_id=%u lkb_id=%x flags=%s res_name=%s", __entry->ls_id, __entry->lkb_id, show_lock_flags(__entry->flags), __print_hex_str(__get_dynamic_array(res_name), __get_dynamic_array_len(res_name))) ); TRACE_EVENT(dlm_unlock_end, TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, __u32 flags, int error), TP_ARGS(ls, lkb, flags, error), TP_STRUCT__entry( __field(__u32, ls_id) __field(__u32, lkb_id) __field(__u32, flags) __field(int, error) __dynamic_array(unsigned char, res_name, lkb->lkb_resource ? lkb->lkb_resource->res_length : 0) ), TP_fast_assign( struct dlm_rsb *r; __entry->ls_id = ls->ls_global_id; __entry->lkb_id = lkb->lkb_id; __entry->flags = flags; __entry->error = error; r = lkb->lkb_resource; if (r) memcpy(__get_dynamic_array(res_name), r->res_name, __get_dynamic_array_len(res_name)); ), TP_printk("ls_id=%u lkb_id=%x flags=%s error=%d res_name=%s", __entry->ls_id, __entry->lkb_id, show_lock_flags(__entry->flags), __entry->error, __print_hex_str(__get_dynamic_array(res_name), __get_dynamic_array_len(res_name))) ); DECLARE_EVENT_CLASS(dlm_rcom_template, TP_PROTO(uint32_t dst, uint32_t h_seq, const struct dlm_rcom *rc), TP_ARGS(dst, h_seq, rc), TP_STRUCT__entry( __field(uint32_t, dst) __field(uint32_t, h_seq) __field(uint32_t, h_version) __field(uint32_t, h_lockspace) __field(uint32_t, h_nodeid) __field(uint16_t, h_length) __field(uint8_t, h_cmd) __field(uint32_t, rc_type) __field(int32_t, rc_result) __field(uint64_t, rc_id) __field(uint64_t, rc_seq) __field(uint64_t, rc_seq_reply) 
__dynamic_array(unsigned char, rc_buf, le16_to_cpu(rc->rc_header.h_length) - sizeof(*rc)) ), TP_fast_assign( __entry->dst = dst; __entry->h_seq = h_seq; __entry->h_version = le32_to_cpu(rc->rc_header.h_version); __entry->h_lockspace = le32_to_cpu(rc->rc_header.u.h_lockspace); __entry->h_nodeid = le32_to_cpu(rc->rc_header.h_nodeid); __entry->h_length = le16_to_cpu(rc->rc_header.h_length); __entry->h_cmd = rc->rc_header.h_cmd; __entry->rc_type = le32_to_cpu(rc->rc_type); __entry->rc_result = le32_to_cpu(rc->rc_result); __entry->rc_id = le64_to_cpu(rc->rc_id); __entry->rc_seq = le64_to_cpu(rc->rc_seq); __entry->rc_seq_reply = le64_to_cpu(rc->rc_seq_reply); memcpy(__get_dynamic_array(rc_buf), rc->rc_buf, __get_dynamic_array_len(rc_buf)); ), TP_printk("dst=%u h_seq=%u h_version=%s h_lockspace=%u h_nodeid=%u " "h_length=%u h_cmd=%s rc_type=%s rc_result=%d " "rc_id=%llu rc_seq=%llu rc_seq_reply=%llu " "rc_buf=0x%s", __entry->dst, __entry->h_seq, show_message_version(__entry->h_version), __entry->h_lockspace, __entry->h_nodeid, __entry->h_length, show_header_cmd(__entry->h_cmd), show_rcom_type(__entry->rc_type), __entry->rc_result, __entry->rc_id, __entry->rc_seq, __entry->rc_seq_reply, __print_hex_str(__get_dynamic_array(rc_buf), __get_dynamic_array_len(rc_buf))) ); DEFINE_EVENT(dlm_rcom_template, dlm_send_rcom, TP_PROTO(uint32_t dst, uint32_t h_seq, const struct dlm_rcom *rc), TP_ARGS(dst, h_seq, rc)); DEFINE_EVENT(dlm_rcom_template, dlm_recv_rcom, TP_PROTO(uint32_t dst, uint32_t h_seq, const struct dlm_rcom *rc), TP_ARGS(dst, h_seq, rc)); TRACE_EVENT(dlm_send_message, TP_PROTO(uint32_t dst, uint32_t h_seq, const struct dlm_message *ms, const void *name, int namelen), TP_ARGS(dst, h_seq, ms, name, namelen), TP_STRUCT__entry( __field(uint32_t, dst) __field(uint32_t, h_seq) __field(uint32_t, h_version) __field(uint32_t, h_lockspace) __field(uint32_t, h_nodeid) __field(uint16_t, h_length) __field(uint8_t, h_cmd) __field(uint32_t, m_type) __field(uint32_t, m_nodeid) 
__field(uint32_t, m_pid) __field(uint32_t, m_lkid) __field(uint32_t, m_remid) __field(uint32_t, m_parent_lkid) __field(uint32_t, m_parent_remid) __field(uint32_t, m_exflags) __field(uint32_t, m_sbflags) __field(uint32_t, m_flags) __field(uint32_t, m_lvbseq) __field(uint32_t, m_hash) __field(int32_t, m_status) __field(int32_t, m_grmode) __field(int32_t, m_rqmode) __field(int32_t, m_bastmode) __field(int32_t, m_asts) __field(int32_t, m_result) __dynamic_array(unsigned char, m_extra, le16_to_cpu(ms->m_header.h_length) - sizeof(*ms)) __dynamic_array(unsigned char, res_name, namelen) ), TP_fast_assign( __entry->dst = dst; __entry->h_seq = h_seq; __entry->h_version = le32_to_cpu(ms->m_header.h_version); __entry->h_lockspace = le32_to_cpu(ms->m_header.u.h_lockspace); __entry->h_nodeid = le32_to_cpu(ms->m_header.h_nodeid); __entry->h_length = le16_to_cpu(ms->m_header.h_length); __entry->h_cmd = ms->m_header.h_cmd; __entry->m_type = le32_to_cpu(ms->m_type); __entry->m_nodeid = le32_to_cpu(ms->m_nodeid); __entry->m_pid = le32_to_cpu(ms->m_pid); __entry->m_lkid = le32_to_cpu(ms->m_lkid); __entry->m_remid = le32_to_cpu(ms->m_remid); __entry->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid); __entry->m_parent_remid = le32_to_cpu(ms->m_parent_remid); __entry->m_exflags = le32_to_cpu(ms->m_exflags); __entry->m_sbflags = le32_to_cpu(ms->m_sbflags); __entry->m_flags = le32_to_cpu(ms->m_flags); __entry->m_lvbseq = le32_to_cpu(ms->m_lvbseq); __entry->m_hash = le32_to_cpu(ms->m_hash); __entry->m_status = le32_to_cpu(ms->m_status); __entry->m_grmode = le32_to_cpu(ms->m_grmode); __entry->m_rqmode = le32_to_cpu(ms->m_rqmode); __entry->m_bastmode = le32_to_cpu(ms->m_bastmode); __entry->m_asts = le32_to_cpu(ms->m_asts); __entry->m_result = le32_to_cpu(ms->m_result); memcpy(__get_dynamic_array(m_extra), ms->m_extra, __get_dynamic_array_len(m_extra)); memcpy(__get_dynamic_array(res_name), name, __get_dynamic_array_len(res_name)); ), TP_printk("dst=%u h_seq=%u h_version=%s h_lockspace=%u 
h_nodeid=%u " "h_length=%u h_cmd=%s m_type=%s m_nodeid=%u " "m_pid=%u m_lkid=%u m_remid=%u m_parent_lkid=%u " "m_parent_remid=%u m_exflags=%s m_sbflags=%s m_flags=%s " "m_lvbseq=%u m_hash=%u m_status=%d m_grmode=%s " "m_rqmode=%s m_bastmode=%s m_asts=%d m_result=%d " "m_extra=0x%s res_name=0x%s", __entry->dst, __entry->h_seq, show_message_version(__entry->h_version), __entry->h_lockspace, __entry->h_nodeid, __entry->h_length, show_header_cmd(__entry->h_cmd), show_message_type(__entry->m_type), __entry->m_nodeid, __entry->m_pid, __entry->m_lkid, __entry->m_remid, __entry->m_parent_lkid, __entry->m_parent_remid, show_lock_flags(__entry->m_exflags), show_dlm_sb_flags(__entry->m_sbflags), show_lkb_flags(__entry->m_flags), __entry->m_lvbseq, __entry->m_hash, __entry->m_status, show_lock_mode(__entry->m_grmode), show_lock_mode(__entry->m_rqmode), show_lock_mode(__entry->m_bastmode), __entry->m_asts, __entry->m_result, __print_hex_str(__get_dynamic_array(m_extra), __get_dynamic_array_len(m_extra)), __print_hex_str(__get_dynamic_array(res_name), __get_dynamic_array_len(res_name))) ); TRACE_EVENT(dlm_recv_message, TP_PROTO(uint32_t dst, uint32_t h_seq, const struct dlm_message *ms), TP_ARGS(dst, h_seq, ms), TP_STRUCT__entry( __field(uint32_t, dst) __field(uint32_t, h_seq) __field(uint32_t, h_version) __field(uint32_t, h_lockspace) __field(uint32_t, h_nodeid) __field(uint16_t, h_length) __field(uint8_t, h_cmd) __field(uint32_t, m_type) __field(uint32_t, m_nodeid) __field(uint32_t, m_pid) __field(uint32_t, m_lkid) __field(uint32_t, m_remid) __field(uint32_t, m_parent_lkid) __field(uint32_t, m_parent_remid) __field(uint32_t, m_exflags) __field(uint32_t, m_sbflags) __field(uint32_t, m_flags) __field(uint32_t, m_lvbseq) __field(uint32_t, m_hash) __field(int32_t, m_status) __field(int32_t, m_grmode) __field(int32_t, m_rqmode) __field(int32_t, m_bastmode) __field(int32_t, m_asts) __field(int32_t, m_result) __dynamic_array(unsigned char, m_extra, le16_to_cpu(ms->m_header.h_length) 
- sizeof(*ms)) ), TP_fast_assign( __entry->dst = dst; __entry->h_seq = h_seq; __entry->h_version = le32_to_cpu(ms->m_header.h_version); __entry->h_lockspace = le32_to_cpu(ms->m_header.u.h_lockspace); __entry->h_nodeid = le32_to_cpu(ms->m_header.h_nodeid); __entry->h_length = le16_to_cpu(ms->m_header.h_length); __entry->h_cmd = ms->m_header.h_cmd; __entry->m_type = le32_to_cpu(ms->m_type); __entry->m_nodeid = le32_to_cpu(ms->m_nodeid); __entry->m_pid = le32_to_cpu(ms->m_pid); __entry->m_lkid = le32_to_cpu(ms->m_lkid); __entry->m_remid = le32_to_cpu(ms->m_remid); __entry->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid); __entry->m_parent_remid = le32_to_cpu(ms->m_parent_remid); __entry->m_exflags = le32_to_cpu(ms->m_exflags); __entry->m_sbflags = le32_to_cpu(ms->m_sbflags); __entry->m_flags = le32_to_cpu(ms->m_flags); __entry->m_lvbseq = le32_to_cpu(ms->m_lvbseq); __entry->m_hash = le32_to_cpu(ms->m_hash); __entry->m_status = le32_to_cpu(ms->m_status); __entry->m_grmode = le32_to_cpu(ms->m_grmode); __entry->m_rqmode = le32_to_cpu(ms->m_rqmode); __entry->m_bastmode = le32_to_cpu(ms->m_bastmode); __entry->m_asts = le32_to_cpu(ms->m_asts); __entry->m_result = le32_to_cpu(ms->m_result); memcpy(__get_dynamic_array(m_extra), ms->m_extra, __get_dynamic_array_len(m_extra)); ), TP_printk("dst=%u h_seq=%u h_version=%s h_lockspace=%u h_nodeid=%u " "h_length=%u h_cmd=%s m_type=%s m_nodeid=%u " "m_pid=%u m_lkid=%u m_remid=%u m_parent_lkid=%u " "m_parent_remid=%u m_exflags=%s m_sbflags=%s m_flags=%s " "m_lvbseq=%u m_hash=%u m_status=%d m_grmode=%s " "m_rqmode=%s m_bastmode=%s m_asts=%d m_result=%d " "m_extra=0x%s", __entry->dst, __entry->h_seq, show_message_version(__entry->h_version), __entry->h_lockspace, __entry->h_nodeid, __entry->h_length, show_header_cmd(__entry->h_cmd), show_message_type(__entry->m_type), __entry->m_nodeid, __entry->m_pid, __entry->m_lkid, __entry->m_remid, __entry->m_parent_lkid, __entry->m_parent_remid, show_lock_flags(__entry->m_exflags), 
show_dlm_sb_flags(__entry->m_sbflags), show_lkb_flags(__entry->m_flags), __entry->m_lvbseq, __entry->m_hash, __entry->m_status, show_lock_mode(__entry->m_grmode), show_lock_mode(__entry->m_rqmode), show_lock_mode(__entry->m_bastmode), __entry->m_asts, __entry->m_result, __print_hex_str(__get_dynamic_array(m_extra), __get_dynamic_array_len(m_extra))) ); DECLARE_EVENT_CLASS(dlm_plock_template, TP_PROTO(const struct dlm_plock_info *info), TP_ARGS(info), TP_STRUCT__entry( __field(uint8_t, optype) __field(uint8_t, ex) __field(uint8_t, wait) __field(uint8_t, flags) __field(uint32_t, pid) __field(int32_t, nodeid) __field(int32_t, rv) __field(uint32_t, fsid) __field(uint64_t, number) __field(uint64_t, start) __field(uint64_t, end) __field(uint64_t, owner) ), TP_fast_assign( __entry->optype = info->optype; __entry->ex = info->ex; __entry->wait = info->wait; __entry->flags = info->flags; __entry->pid = info->pid; __entry->nodeid = info->nodeid; __entry->rv = info->rv; __entry->fsid = info->fsid; __entry->number = info->number; __entry->start = info->start; __entry->end = info->end; __entry->owner = info->owner; ), TP_printk("fsid=%u number=%llx owner=%llx optype=%d ex=%d wait=%d flags=%x pid=%u nodeid=%d rv=%d start=%llx end=%llx", __entry->fsid, __entry->number, __entry->owner, __entry->optype, __entry->ex, __entry->wait, __entry->flags, __entry->pid, __entry->nodeid, __entry->rv, __entry->start, __entry->end) ); DEFINE_EVENT(dlm_plock_template, dlm_plock_read, TP_PROTO(const struct dlm_plock_info *info), TP_ARGS(info)); DEFINE_EVENT(dlm_plock_template, dlm_plock_write, TP_PROTO(const struct dlm_plock_info *info), TP_ARGS(info)); TRACE_EVENT(dlm_send, TP_PROTO(int nodeid, int ret), TP_ARGS(nodeid, ret), TP_STRUCT__entry( __field(int, nodeid) __field(int, ret) ), TP_fast_assign( __entry->nodeid = nodeid; __entry->ret = ret; ), TP_printk("nodeid=%d ret=%d", __entry->nodeid, __entry->ret) ); TRACE_EVENT(dlm_recv, TP_PROTO(int nodeid, int ret), TP_ARGS(nodeid, ret), 
TP_STRUCT__entry( __field(int, nodeid) __field(int, ret) ), TP_fast_assign( __entry->nodeid = nodeid; __entry->ret = ret; ), TP_printk("nodeid=%d ret=%d", __entry->nodeid, __entry->ret) ); #endif /* if !defined(_TRACE_DLM_H) || defined(TRACE_HEADER_MULTI_READ) */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 1 4 4 2 3 1 6 1 3 7 3 7 3 3 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 
519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 
1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 
1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 
1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 
2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 
2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 
3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 
3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 
3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 
4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 
4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 
5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Interfaces handler. * * Version: @(#)dev.h 1.0.10 08/12/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Donald J. Becker, <becker@cesdis.gsfc.nasa.gov> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Bjorn Ekwall. 
<bj0rn@blox.se> * Pekka Riikonen <priikone@poseidon.pspt.fi> * * Moved to /usr/include/linux for NET3 */ #ifndef _LINUX_NETDEVICE_H #define _LINUX_NETDEVICE_H #include <linux/timer.h> #include <linux/bug.h> #include <linux/delay.h> #include <linux/atomic.h> #include <linux/prefetch.h> #include <asm/cache.h> #include <asm/byteorder.h> #include <asm/local.h> #include <linux/percpu.h> #include <linux/rculist.h> #include <linux/workqueue.h> #include <linux/dynamic_queue_limits.h> #include <net/net_namespace.h> #ifdef CONFIG_DCB #include <net/dcbnl.h> #endif #include <net/netprio_cgroup.h> #include <linux/netdev_features.h> #include <linux/neighbour.h> #include <linux/netdevice_xmit.h> #include <uapi/linux/netdevice.h> #include <uapi/linux/if_bonding.h> #include <uapi/linux/pkt_cls.h> #include <uapi/linux/netdev.h> #include <linux/hashtable.h> #include <linux/rbtree.h> #include <net/net_trackers.h> #include <net/net_debug.h> #include <net/dropreason-core.h> struct netpoll_info; struct device; struct ethtool_ops; struct kernel_hwtstamp_config; struct phy_device; struct dsa_port; struct ip_tunnel_parm_kern; struct macsec_context; struct macsec_ops; struct netdev_name_node; struct sd_flow_limit; struct sfp_bus; /* 802.11 specific */ struct wireless_dev; /* 802.15.4 specific */ struct wpan_dev; struct mpls_dev; /* UDP Tunnel offloads */ struct udp_tunnel_info; struct udp_tunnel_nic_info; struct udp_tunnel_nic; struct bpf_prog; struct xdp_buff; struct xdp_frame; struct xdp_metadata_ops; struct xdp_md; struct ethtool_netdev_state; typedef u32 xdp_features_t; void synchronize_net(void); void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); void netdev_sw_irq_coalesce_default_on(struct net_device *dev); /* Backlog congestion levels */ #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ #define NET_RX_DROP 1 /* packet dropped */ #define MAX_NEST_DEV 8 /* * Transmit return codes: transmit return codes originate from three different * 
namespaces: * * - qdisc return codes * - driver transmit return codes * - errno values * * Drivers are allowed to return any one of those in their hard_start_xmit() * function. Real network devices commonly used with qdiscs should only return * the driver transmit return codes though - when qdiscs are used, the actual * transmission happens asynchronously, so the value is not propagated to * higher layers. Virtual network devices transmit synchronously; in this case * the driver transmit return codes are consumed by dev_queue_xmit(), and all * others are propagated to higher layers. */ /* qdisc ->enqueue() return codes. */ #define NET_XMIT_SUCCESS 0x00 #define NET_XMIT_DROP 0x01 /* skb dropped */ #define NET_XMIT_CN 0x02 /* congestion notification */ #define NET_XMIT_MASK 0x0f /* qdisc flags in net/sch_generic.h */ /* NET_XMIT_CN is special. It does not guarantee that this packet is lost. It * indicates that the device will soon be dropping packets, or already drops * some packets of the same priority; prompting us to send less aggressively. */ #define net_xmit_eval(e) ((e) == NET_XMIT_CN ? 0 : (e)) #define net_xmit_errno(e) ((e) != NET_XMIT_CN ? -ENOBUFS : 0) /* Driver transmit return codes */ #define NETDEV_TX_MASK 0xf0 enum netdev_tx { __NETDEV_TX_MIN = INT_MIN, /* make sure enum is signed */ NETDEV_TX_OK = 0x00, /* driver took care of packet */ NETDEV_TX_BUSY = 0x10, /* driver tx path was busy*/ }; typedef enum netdev_tx netdev_tx_t; /* * Current order: NETDEV_TX_MASK > NET_XMIT_MASK >= 0 is significant; * hard_start_xmit() return < NET_XMIT_MASK means skb was consumed. 
*/ static inline bool dev_xmit_complete(int rc) { /* * Positive cases with an skb consumed by a driver: * - successful transmission (rc == NETDEV_TX_OK) * - error while transmitting (rc < 0) * - error while queueing to a different device (rc & NET_XMIT_MASK) */ if (likely(rc < NET_XMIT_MASK)) return true; return false; } /* * Compute the worst-case header length according to the protocols * used. */ #if defined(CONFIG_HYPERV_NET) # define LL_MAX_HEADER 128 #elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) # if defined(CONFIG_MAC80211_MESH) # define LL_MAX_HEADER 128 # else # define LL_MAX_HEADER 96 # endif #else # define LL_MAX_HEADER 32 #endif #if !IS_ENABLED(CONFIG_NET_IPIP) && !IS_ENABLED(CONFIG_NET_IPGRE) && \ !IS_ENABLED(CONFIG_IPV6_SIT) && !IS_ENABLED(CONFIG_IPV6_TUNNEL) #define MAX_HEADER LL_MAX_HEADER #else #define MAX_HEADER (LL_MAX_HEADER + 48) #endif /* * Old network device statistics. Fields are native words * (unsigned long) so they can be read and written atomically. */ #define NET_DEV_STAT(FIELD) \ union { \ unsigned long FIELD; \ atomic_long_t __##FIELD; \ } struct net_device_stats { NET_DEV_STAT(rx_packets); NET_DEV_STAT(tx_packets); NET_DEV_STAT(rx_bytes); NET_DEV_STAT(tx_bytes); NET_DEV_STAT(rx_errors); NET_DEV_STAT(tx_errors); NET_DEV_STAT(rx_dropped); NET_DEV_STAT(tx_dropped); NET_DEV_STAT(multicast); NET_DEV_STAT(collisions); NET_DEV_STAT(rx_length_errors); NET_DEV_STAT(rx_over_errors); NET_DEV_STAT(rx_crc_errors); NET_DEV_STAT(rx_frame_errors); NET_DEV_STAT(rx_fifo_errors); NET_DEV_STAT(rx_missed_errors); NET_DEV_STAT(tx_aborted_errors); NET_DEV_STAT(tx_carrier_errors); NET_DEV_STAT(tx_fifo_errors); NET_DEV_STAT(tx_heartbeat_errors); NET_DEV_STAT(tx_window_errors); NET_DEV_STAT(rx_compressed); NET_DEV_STAT(tx_compressed); }; #undef NET_DEV_STAT /* per-cpu stats, allocated on demand. * Try to fit them in a single cache line, for dev_get_stats() sake. 
*/ struct net_device_core_stats { unsigned long rx_dropped; unsigned long tx_dropped; unsigned long rx_nohandler; unsigned long rx_otherhost_dropped; } __aligned(4 * sizeof(unsigned long)); #include <linux/cache.h> #include <linux/skbuff.h> struct neighbour; struct neigh_parms; struct sk_buff; struct netdev_hw_addr { struct list_head list; struct rb_node node; unsigned char addr[MAX_ADDR_LEN]; unsigned char type; #define NETDEV_HW_ADDR_T_LAN 1 #define NETDEV_HW_ADDR_T_SAN 2 #define NETDEV_HW_ADDR_T_UNICAST 3 #define NETDEV_HW_ADDR_T_MULTICAST 4 bool global_use; int sync_cnt; int refcount; int synced; struct rcu_head rcu_head; }; struct netdev_hw_addr_list { struct list_head list; int count; /* Auxiliary tree for faster lookup on addition and deletion */ struct rb_root tree; }; #define netdev_hw_addr_list_count(l) ((l)->count) #define netdev_hw_addr_list_empty(l) (netdev_hw_addr_list_count(l) == 0) #define netdev_hw_addr_list_for_each(ha, l) \ list_for_each_entry(ha, &(l)->list, list) #define netdev_uc_count(dev) netdev_hw_addr_list_count(&(dev)->uc) #define netdev_uc_empty(dev) netdev_hw_addr_list_empty(&(dev)->uc) #define netdev_for_each_uc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->uc) #define netdev_for_each_synced_uc_addr(_ha, _dev) \ netdev_for_each_uc_addr((_ha), (_dev)) \ if ((_ha)->sync_cnt) #define netdev_mc_count(dev) netdev_hw_addr_list_count(&(dev)->mc) #define netdev_mc_empty(dev) netdev_hw_addr_list_empty(&(dev)->mc) #define netdev_for_each_mc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->mc) #define netdev_for_each_synced_mc_addr(_ha, _dev) \ netdev_for_each_mc_addr((_ha), (_dev)) \ if ((_ha)->sync_cnt) struct hh_cache { unsigned int hh_len; seqlock_t hh_lock; /* cached hardware header; allow for machine alignment needs. 
*/ #define HH_DATA_MOD 16 #define HH_DATA_OFF(__len) \ (HH_DATA_MOD - (((__len - 1) & (HH_DATA_MOD - 1)) + 1)) #define HH_DATA_ALIGN(__len) \ (((__len)+(HH_DATA_MOD-1))&~(HH_DATA_MOD - 1)) unsigned long hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)]; }; /* Reserve HH_DATA_MOD byte-aligned hard_header_len, but at least that much. * Alternative is: * dev->hard_header_len ? (dev->hard_header_len + * (HH_DATA_MOD - 1)) & ~(HH_DATA_MOD - 1) : 0 * * We could use other alignment values, but we must maintain the * relationship HH alignment <= LL alignment. */ #define LL_RESERVED_SPACE(dev) \ ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom)) \ & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) #define LL_RESERVED_SPACE_EXTRA(dev,extra) \ ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom) + (extra)) \ & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) struct header_ops { int (*create) (struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len); int (*parse)(const struct sk_buff *skb, unsigned char *haddr); int (*cache)(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void (*cache_update)(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); bool (*validate)(const char *ll_header, unsigned int len); __be16 (*parse_protocol)(const struct sk_buff *skb); }; /* These flag bits are private to the generic network queueing * layer; they may not be explicitly referenced by any other * code. 
*/ enum netdev_state_t { __LINK_STATE_START, __LINK_STATE_PRESENT, __LINK_STATE_NOCARRIER, __LINK_STATE_LINKWATCH_PENDING, __LINK_STATE_DORMANT, __LINK_STATE_TESTING, }; struct gro_list { struct list_head list; int count; }; /* * size of gro hash buckets, must less than bit number of * napi_struct::gro_bitmask */ #define GRO_HASH_BUCKETS 8 /* * Structure for NAPI scheduling similar to tasklet but with weighting */ struct napi_struct { /* The poll_list must only be managed by the entity which * changes the state of the NAPI_STATE_SCHED bit. This means * whoever atomically sets that bit can add this napi_struct * to the per-CPU poll_list, and whoever clears that bit * can remove from the list right before clearing the bit. */ struct list_head poll_list; unsigned long state; int weight; int defer_hard_irqs_count; unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL /* CPU actively polling if netpoll is configured */ int poll_owner; #endif /* CPU on which NAPI has been scheduled for processing */ int list_owner; struct net_device *dev; struct gro_list gro_hash[GRO_HASH_BUCKETS]; struct sk_buff *skb; struct list_head rx_list; /* Pending GRO_NORMAL skbs */ int rx_count; /* length of rx_list */ unsigned int napi_id; struct hrtimer timer; struct task_struct *thread; /* control-path-only fields follow */ struct list_head dev_list; struct hlist_node napi_hash_node; int irq; }; enum { NAPI_STATE_SCHED, /* Poll is scheduled */ NAPI_STATE_MISSED, /* reschedule a napi */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ NAPI_STATE_LISTED, /* NAPI added to system lists */ NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ NAPI_STATE_SCHED_THREADED, /* Napi is 
currently scheduled in threaded mode */ }; enum { NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), }; enum gro_result { GRO_MERGED, GRO_MERGED_FREE, GRO_HELD, GRO_NORMAL, GRO_CONSUMED, }; typedef enum gro_result gro_result_t; /* * enum rx_handler_result - Possible return values for rx_handlers. * @RX_HANDLER_CONSUMED: skb was consumed by rx_handler, do not process it * further. * @RX_HANDLER_ANOTHER: Do another round in receive path. This is indicated in * case skb->dev was changed by rx_handler. * @RX_HANDLER_EXACT: Force exact delivery, no wildcard. * @RX_HANDLER_PASS: Do nothing, pass the skb as if no rx_handler was called. * * rx_handlers are functions called from inside __netif_receive_skb(), to do * special processing of the skb, prior to delivery to protocol handlers. * * Currently, a net_device can only have a single rx_handler registered. Trying * to register a second rx_handler will return -EBUSY. * * To register a rx_handler on a net_device, use netdev_rx_handler_register(). * To unregister a rx_handler on a net_device, use * netdev_rx_handler_unregister(). * * Upon return, rx_handler is expected to tell __netif_receive_skb() what to * do with the skb. * * If the rx_handler consumed the skb in some way, it should return * RX_HANDLER_CONSUMED. This is appropriate when the rx_handler arranged for * the skb to be delivered in some other way. * * If the rx_handler changed skb->dev, to divert the skb to another * net_device, it should return RX_HANDLER_ANOTHER. 
The rx_handler for the * new device will be called if it exists. * * If the rx_handler decides the skb should be ignored, it should return * RX_HANDLER_EXACT. The skb will only be delivered to protocol handlers that * are registered on exact device (ptype->dev == skb->dev). * * If the rx_handler didn't change skb->dev, but wants the skb to be normally * delivered, it should return RX_HANDLER_PASS. * * A device without a registered rx_handler will behave as if rx_handler * returned RX_HANDLER_PASS. */ enum rx_handler_result { RX_HANDLER_CONSUMED, RX_HANDLER_ANOTHER, RX_HANDLER_EXACT, RX_HANDLER_PASS, }; typedef enum rx_handler_result rx_handler_result_t; typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb); void __napi_schedule(struct napi_struct *n); void __napi_schedule_irqoff(struct napi_struct *n); static inline bool napi_disable_pending(struct napi_struct *n) { return test_bit(NAPI_STATE_DISABLE, &n->state); } static inline bool napi_prefer_busy_poll(struct napi_struct *n) { return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); } /** * napi_is_scheduled - test if NAPI is scheduled * @n: NAPI context * * This check is "best-effort". With no locking implemented, * a NAPI can be scheduled or terminate right after this check * and produce not precise results. * * NAPI_STATE_SCHED is an internal state, napi_is_scheduled * should not be used normally and napi_schedule should be * used instead. * * Use only if the driver really needs to check if a NAPI * is scheduled for example in the context of delayed timer * that can be skipped if a NAPI is already scheduled. * * Return True if NAPI is scheduled, False otherwise. */ static inline bool napi_is_scheduled(struct napi_struct *n) { return test_bit(NAPI_STATE_SCHED, &n->state); } bool napi_schedule_prep(struct napi_struct *n); /** * napi_schedule - schedule NAPI poll * @n: NAPI context * * Schedule NAPI poll routine to be called if it is not already * running. 
* Return true if we schedule a NAPI or false if not. * Refer to napi_schedule_prep() for additional reason on why * a NAPI might not be scheduled. */ static inline bool napi_schedule(struct napi_struct *n) { if (napi_schedule_prep(n)) { __napi_schedule(n); return true; } return false; } /** * napi_schedule_irqoff - schedule NAPI poll * @n: NAPI context * * Variant of napi_schedule(), assuming hard irqs are masked. */ static inline void napi_schedule_irqoff(struct napi_struct *n) { if (napi_schedule_prep(n)) __napi_schedule_irqoff(n); } /** * napi_complete_done - NAPI processing complete * @n: NAPI context * @work_done: number of packets processed * * Mark NAPI processing as complete. Should only be called if poll budget * has not been completely consumed. * Prefer over napi_complete(). * Return false if device should avoid rearming interrupts. */ bool napi_complete_done(struct napi_struct *n, int work_done); static inline bool napi_complete(struct napi_struct *n) { return napi_complete_done(n, 0); } int dev_set_threaded(struct net_device *dev, bool threaded); /** * napi_disable - prevent NAPI from scheduling * @n: NAPI context * * Stop NAPI from being scheduled on this context. * Waits till any outstanding processing completes. */ void napi_disable(struct napi_struct *n); void napi_enable(struct napi_struct *n); /** * napi_synchronize - wait until NAPI is not running * @n: NAPI context * * Wait until NAPI is done being scheduled on this context. * Waits till any outstanding processing completes but * does not disable future activations. */ static inline void napi_synchronize(const struct napi_struct *n) { if (IS_ENABLED(CONFIG_SMP)) while (test_bit(NAPI_STATE_SCHED, &n->state)) msleep(1); else barrier(); } /** * napi_if_scheduled_mark_missed - if napi is running, set the * NAPIF_STATE_MISSED * @n: NAPI context * * If napi is running, set the NAPIF_STATE_MISSED, and return true if * NAPI is scheduled. 
**/ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n) { unsigned long val, new; val = READ_ONCE(n->state); do { if (val & NAPIF_STATE_DISABLE) return true; if (!(val & NAPIF_STATE_SCHED)) return false; new = val | NAPIF_STATE_MISSED; } while (!try_cmpxchg(&n->state, &val, new)); return true; } enum netdev_queue_state_t { __QUEUE_STATE_DRV_XOFF, __QUEUE_STATE_STACK_XOFF, __QUEUE_STATE_FROZEN, }; #define QUEUE_STATE_DRV_XOFF (1 << __QUEUE_STATE_DRV_XOFF) #define QUEUE_STATE_STACK_XOFF (1 << __QUEUE_STATE_STACK_XOFF) #define QUEUE_STATE_FROZEN (1 << __QUEUE_STATE_FROZEN) #define QUEUE_STATE_ANY_XOFF (QUEUE_STATE_DRV_XOFF | QUEUE_STATE_STACK_XOFF) #define QUEUE_STATE_ANY_XOFF_OR_FROZEN (QUEUE_STATE_ANY_XOFF | \ QUEUE_STATE_FROZEN) #define QUEUE_STATE_DRV_XOFF_OR_FROZEN (QUEUE_STATE_DRV_XOFF | \ QUEUE_STATE_FROZEN) /* * __QUEUE_STATE_DRV_XOFF is used by drivers to stop the transmit queue. The * netif_tx_* functions below are used to manipulate this flag. The * __QUEUE_STATE_STACK_XOFF flag is used by the stack to stop the transmit * queue independently. The netif_xmit_*stopped functions below are called * to check if the queue has been stopped by the driver or stack (either * of the XOFF bits are set in the state). Drivers should not need to call * netif_xmit*stopped functions, they should only be using netif_tx_*. 
*/ struct netdev_queue { /* * read-mostly part */ struct net_device *dev; netdevice_tracker dev_tracker; struct Qdisc __rcu *qdisc; struct Qdisc __rcu *qdisc_sleeping; #ifdef CONFIG_SYSFS struct kobject kobj; #endif #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) int numa_node; #endif unsigned long tx_maxrate; /* * Number of TX timeouts for this queue * (/sys/class/net/DEV/Q/trans_timeout) */ atomic_long_t trans_timeout; /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif /* NAPI instance for the queue * Readers and writers must hold RTNL */ struct napi_struct *napi; /* * write-mostly part */ spinlock_t _xmit_lock ____cacheline_aligned_in_smp; int xmit_lock_owner; /* * Time (in jiffies) of last Tx */ unsigned long trans_start; unsigned long state; #ifdef CONFIG_BQL struct dql dql; #endif } ____cacheline_aligned_in_smp; extern int sysctl_fb_tunnels_only_for_init_net; extern int sysctl_devconf_inherit_init_net; /* * sysctl_fb_tunnels_only_for_init_net == 0 : For all netns * == 1 : For initns only * == 2 : For none. 
*/ static inline bool net_has_fallback_tunnels(const struct net *net) { #if IS_ENABLED(CONFIG_SYSCTL) int fb_tunnels_only_for_init_net = READ_ONCE(sysctl_fb_tunnels_only_for_init_net); return !fb_tunnels_only_for_init_net || (net_eq(net, &init_net) && fb_tunnels_only_for_init_net == 1); #else return true; #endif } static inline int net_inherit_devconf(void) { #if IS_ENABLED(CONFIG_SYSCTL) return READ_ONCE(sysctl_devconf_inherit_init_net); #else return 0; #endif } static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) return q->numa_node; #else return NUMA_NO_NODE; #endif } static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) q->numa_node = node; #endif } #ifdef CONFIG_RFS_ACCEL bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); #endif /* XPS map type and offset of the xps map within net_device->xps_maps[]. */ enum xps_map_type { XPS_CPUS = 0, XPS_RXQS, XPS_MAPS_MAX, }; #ifdef CONFIG_XPS /* * This structure holds an XPS map which can be of variable length. The * map is an array of queues. */ struct xps_map { unsigned int len; unsigned int alloc_len; struct rcu_head rcu; u16 queues[]; }; #define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + ((_num) * sizeof(u16))) #define XPS_MIN_MAP_ALLOC ((L1_CACHE_ALIGN(offsetof(struct xps_map, queues[1])) \ - sizeof(struct xps_map)) / sizeof(u16)) /* * This structure holds all XPS maps for device. Maps are indexed by CPU. * * We keep track of the number of cpus/rxqs used when the struct is allocated, * in nr_ids. This will help not accessing out-of-bound memory. * * We keep track of the number of traffic classes used when the struct is * allocated, in num_tc. This will be used to navigate the maps, to ensure we're * not crossing its upper bound, as the original dev->num_tc can be updated in * the meantime. 
*/ struct xps_dev_maps { struct rcu_head rcu; unsigned int nr_ids; s16 num_tc; struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */ }; #define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *))) #define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\ (_rxqs * (_tcs) * sizeof(struct xps_map *))) #endif /* CONFIG_XPS */ #define TC_MAX_QUEUE 16 #define TC_BITMASK 15 /* HW offloaded queuing disciplines txq count and offset maps */ struct netdev_tc_txq { u16 count; u16 offset; }; #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) /* * This structure is to hold information about the device * configured to run FCoE protocol stack. */ struct netdev_fcoe_hbainfo { char manufacturer[64]; char serial_number[64]; char hardware_version[64]; char driver_version[64]; char optionrom_version[64]; char firmware_version[64]; char model[256]; char model_description[256]; }; #endif #define MAX_PHYS_ITEM_ID_LEN 32 /* This structure holds a unique identifier to identify some * physical item (port for example) used by a netdevice. 
*/ struct netdev_phys_item_id { unsigned char id[MAX_PHYS_ITEM_ID_LEN]; unsigned char id_len; }; static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a, struct netdev_phys_item_id *b) { return a->id_len == b->id_len && memcmp(a->id, b->id, a->id_len) == 0; } typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); enum net_device_path_type { DEV_PATH_ETHERNET = 0, DEV_PATH_VLAN, DEV_PATH_BRIDGE, DEV_PATH_PPPOE, DEV_PATH_DSA, DEV_PATH_MTK_WDMA, }; struct net_device_path { enum net_device_path_type type; const struct net_device *dev; union { struct { u16 id; __be16 proto; u8 h_dest[ETH_ALEN]; } encap; struct { enum { DEV_PATH_BR_VLAN_KEEP, DEV_PATH_BR_VLAN_TAG, DEV_PATH_BR_VLAN_UNTAG, DEV_PATH_BR_VLAN_UNTAG_HW, } vlan_mode; u16 vlan_id; __be16 vlan_proto; } bridge; struct { int port; u16 proto; } dsa; struct { u8 wdma_idx; u8 queue; u16 wcid; u8 bss; u8 amsdu; } mtk_wdma; }; }; #define NET_DEVICE_PATH_STACK_MAX 5 #define NET_DEVICE_PATH_VLAN_MAX 2 struct net_device_path_stack { int num_paths; struct net_device_path path[NET_DEVICE_PATH_STACK_MAX]; }; struct net_device_path_ctx { const struct net_device *dev; u8 daddr[ETH_ALEN]; int num_vlans; struct { u16 id; __be16 proto; } vlan[NET_DEVICE_PATH_VLAN_MAX]; }; enum tc_setup_type { TC_QUERY_CAPS, TC_SETUP_QDISC_MQPRIO, TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, TC_SETUP_BLOCK, TC_SETUP_QDISC_CBS, TC_SETUP_QDISC_RED, TC_SETUP_QDISC_PRIO, TC_SETUP_QDISC_MQ, TC_SETUP_QDISC_ETF, TC_SETUP_ROOT_QDISC, TC_SETUP_QDISC_GRED, TC_SETUP_QDISC_TAPRIO, TC_SETUP_FT, TC_SETUP_QDISC_ETS, TC_SETUP_QDISC_TBF, TC_SETUP_QDISC_FIFO, TC_SETUP_QDISC_HTB, TC_SETUP_ACT, }; /* These structures hold the attributes of bpf state that are being passed * to the netdevice through the bpf op. */ enum bpf_netdev_command { /* Set or clear a bpf program used in the earliest stages of packet * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. 
The callee
	 * is responsible for calling bpf_prog_put on any old progs that are
	 * stored. In case of error, the callee need not release the new prog
	 * reference, but on success it takes ownership and must bpf_prog_put
	 * when it is no longer used.
	 */
	XDP_SETUP_PROG,
	XDP_SETUP_PROG_HW,
	/* BPF program for offload callbacks, invoked at program load time. */
	BPF_OFFLOAD_MAP_ALLOC,
	BPF_OFFLOAD_MAP_FREE,
	XDP_SETUP_XSK_POOL,
};

/* Forward declarations: only pointers to these types are used below. */
struct bpf_prog_offload_ops;
struct netlink_ext_ack;
struct xdp_umem;
struct xdp_dev_bulk_queue;
struct bpf_xdp_link;

/* XDP modes; __MAX_XDP_MODE follows the last real mode and therefore
 * equals the number of modes.
 */
enum bpf_xdp_mode {
	XDP_MODE_SKB = 0,
	XDP_MODE_DRV = 1,
	XDP_MODE_HW = 2,
	__MAX_XDP_MODE
};

/* Pairs a BPF program pointer with a bpf_xdp_link pointer. */
struct bpf_xdp_entity {
	struct bpf_prog *prog;
	struct bpf_xdp_link *link;
};

/* Argument block for the bpf op: @command is followed by a union whose
 * members are labelled with the command(s) they belong to.
 */
struct netdev_bpf {
	enum bpf_netdev_command command;
	union {
		/* XDP_SETUP_PROG */
		struct {
			u32 flags;
			struct bpf_prog *prog;
			struct netlink_ext_ack *extack;
		};
		/* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */
		struct {
			struct bpf_offloaded_map *offmap;
		};
		/* XDP_SETUP_XSK_POOL */
		struct {
			struct xsk_buff_pool *pool;
			u16 queue_id;
		} xsk;
	};
};

/* Flags for ndo_xsk_wakeup.
*/
#define XDP_WAKEUP_RX (1 << 0)
#define XDP_WAKEUP_TX (1 << 1)

#ifdef CONFIG_XFRM_OFFLOAD
/* Callback table for xfrm state and policy objects; only declared when
 * CONFIG_XFRM_OFFLOAD is set. The add hooks return int and take an
 * extack pointer; all other hooks except xdo_dev_offload_ok return void.
 */
struct xfrmdev_ops {
	int	(*xdo_dev_state_add) (struct xfrm_state *x, struct netlink_ext_ack *extack);
	void	(*xdo_dev_state_delete) (struct xfrm_state *x);
	void	(*xdo_dev_state_free) (struct xfrm_state *x);
	bool	(*xdo_dev_offload_ok) (struct sk_buff *skb,
				       struct xfrm_state *x);
	void	(*xdo_dev_state_advance_esn) (struct xfrm_state *x);
	void	(*xdo_dev_state_update_stats) (struct xfrm_state *x);
	int	(*xdo_dev_policy_add) (struct xfrm_policy *x, struct netlink_ext_ack *extack);
	void	(*xdo_dev_policy_delete) (struct xfrm_policy *x);
	void	(*xdo_dev_policy_free) (struct xfrm_policy *x);
};
#endif

/* Interface alias string stored as a flexible array after an rcu_head.
 * NOTE(review): presumably released through RCU - confirm against users.
 */
struct dev_ifalias {
	struct rcu_head rcuhead;
	char ifalias[];
};

struct devlink;
struct tlsdev_ops;

/* List node that carries a pointer to a notifier block. */
struct netdev_net_notifier {
	struct list_head list;
	struct notifier_block *nb;
};

/*
 * This structure defines the management hooks for network devices.
 * The following hooks can be defined; unless noted otherwise, they are
 * optional and can be filled with a null pointer.
 *
 * int (*ndo_init)(struct net_device *dev);
 *     This function is called once when a network device is registered.
 *     The network device can use this for any late stage initialization
 *     or semantic validation. It can fail with an error code which will
 *     be propagated back to register_netdev.
 *
 * void (*ndo_uninit)(struct net_device *dev);
 *     This function is called when device is unregistered or when registration
 *     fails. It is not called if init fails.
 *
 * int (*ndo_open)(struct net_device *dev);
 *     This function is called when a network device transitions to the up
 *     state.
 *
 * int (*ndo_stop)(struct net_device *dev);
 *     This function is called when a network device transitions to the down
 *     state.
 *
 * netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
 *                               struct net_device *dev);
 *	Called when a packet needs to be transmitted.
 *	Returns NETDEV_TX_OK.
Can return NETDEV_TX_BUSY, but you should stop * the queue before that can happen; it's for obsolete devices and weird * corner cases, but the stack really does a non-trivial amount * of useless work if you return NETDEV_TX_BUSY. * Required; cannot be NULL. * * netdev_features_t (*ndo_features_check)(struct sk_buff *skb, * struct net_device *dev, * netdev_features_t features); * Called by core transmit path to determine if device is capable of * performing offload operations on a given packet. This is to give * the device an opportunity to implement any restrictions that cannot * be otherwise expressed by feature flags. The check is called with * the set of features that the stack has calculated and it returns * those the driver believes to be appropriate. * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, * struct net_device *sb_dev); * Called to decide which queue to use when device supports multiple * transmit queues. * * void (*ndo_change_rx_flags)(struct net_device *dev, int flags); * This function is called to allow device receiver to make * changes to configuration when multicast or promiscuous is enabled. * * void (*ndo_set_rx_mode)(struct net_device *dev); * This function is called when the device changes address list filtering. * If driver handles unicast address filtering, it should set * IFF_UNICAST_FLT in its priv_flags. * * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); * This function is called when the Media Access Control address * needs to be changed. If this interface is not defined, the * MAC address can not be changed. * * int (*ndo_validate_addr)(struct net_device *dev); * Test if Media Access Control address is valid for the device. * * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); * Old-style ioctl entry point. This is used internally by the * appletalk and ieee802154 subsystems but is no longer called by * the device ioctl handler.
* * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd); * Used by the bonding driver for its device specific ioctls: * SIOCBONDENSLAVE, SIOCBONDRELEASE, SIOCBONDSETHWADDR, SIOCBONDCHANGEACTIVE, * SIOCBONDSLAVEINFOQUERY, and SIOCBONDINFOQUERY * * * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); * Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG, * SIOCSMIIREG, SIOCSHWTSTAMP and SIOCGHWTSTAMP. * * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); * Used to set network devices bus interface parameters. This interface * is retained for legacy reasons; new devices should use the bus * interface (PCI) for low level management. * * int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); * Called when a user wants to change the Maximum Transfer Unit * of a device. * * void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue); * Callback used when the transmitter has not made any progress * for dev->watchdog ticks. * * void (*ndo_get_stats64)(struct net_device *dev, * struct rtnl_link_stats64 *storage); * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); * Called when a user wants to get the network device usage * statistics. Drivers must do one of the following: * 1. Define @ndo_get_stats64 to fill in a zero-initialised * rtnl_link_stats64 structure passed by the caller. * 2. Define @ndo_get_stats to update a net_device_stats structure * (which should normally be dev->stats) and return a pointer to * it. The structure may be changed asynchronously only if each * field is written atomically. * 3. Update dev->stats asynchronously and atomically, and define * neither operation. * * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id) * Return true if this device supports offload stats of this attr_id. * * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev, * void *attr_data) * Get statistics for offload operations by attr_id. 
Write it into the * attr_data pointer. * * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid); * If device supports VLAN filtering this function is called when a * VLAN id is registered. * * int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid); * If device supports VLAN filtering this function is called when a * VLAN id is unregistered. * * void (*ndo_poll_controller)(struct net_device *dev); * * SR-IOV management functions. * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac); * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, * u8 qos, __be16 proto); * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, * int max_tx_rate); * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); * int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_config)(struct net_device *dev, * int vf, struct ifla_vf_info *ivf); * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); * int (*ndo_set_vf_port)(struct net_device *dev, int vf, * struct nlattr *port[]); * * Enable or disable the VF ability to query its RSS Redirection Table and * Hash Key. This is needed since on some devices VF share this information * with PF and querying it may introduce a theoretical security risk. * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type, * void *type_data); * Called to setup any 'tc' scheduler, classifier or action on @dev. * This is always called from the stack with the rtnl lock held and netif * tx queues stopped. This allows the netdevice to perform queue * management safely. * * Fiber Channel over Ethernet (FCoE) offload functions. 
* int (*ndo_fcoe_enable)(struct net_device *dev); * Called when the FCoE protocol stack wants to start using LLD for FCoE * so the underlying device can perform whatever needed configuration or * initialization to support acceleration of FCoE traffic. * * int (*ndo_fcoe_disable)(struct net_device *dev); * Called when the FCoE protocol stack wants to stop using LLD for FCoE * so the underlying device can perform whatever needed clean-ups to * stop supporting acceleration of FCoE traffic. * * int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid, * struct scatterlist *sgl, unsigned int sgc); * Called when the FCoE Initiator wants to initialize an I/O that * is a possible candidate for Direct Data Placement (DDP). The LLD can * perform necessary setup and returns 1 to indicate the device is set up * successfully to perform DDP on this I/O, otherwise this returns 0. * * int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid); * Called when the FCoE Initiator/Target is done with the DDPed I/O as * indicated by the FC exchange id 'xid', so the underlying device can * clean up and reuse resources for later DDP requests. * * int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid, * struct scatterlist *sgl, unsigned int sgc); * Called when the FCoE Target wants to initialize an I/O that * is a possible candidate for Direct Data Placement (DDP). The LLD can * perform necessary setup and returns 1 to indicate the device is set up * successfully to perform DDP on this I/O, otherwise this returns 0. * * int (*ndo_fcoe_get_hbainfo)(struct net_device *dev, * struct netdev_fcoe_hbainfo *hbainfo); * Called when the FCoE Protocol stack wants information on the underlying * device. This information is utilized by the FCoE protocol stack to * register attributes with Fiber Channel management service as per the * FC-GS Fabric Device Management Information(FDMI) specification. 
* * int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type); * Called when the underlying device wants to override default World Wide * Name (WWN) generation mechanism in FCoE protocol stack to pass its own * World Wide Port Name (WWPN) or World Wide Node Name (WWNN) to the FCoE * protocol stack to use. * * RFS acceleration. * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb, * u16 rxq_index, u32 flow_id); * Set hardware filter for RFS. rxq_index is the target queue index; * flow_id is a flow ID to be passed to rps_may_expire_flow() later. * Return the filter ID on success, or a negative error code. * * Slave management functions (for bridge, bonding, etc). * int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev, * struct netlink_ext_ack *extack); * Called to make another netdev an underling. * * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev. * * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev, * struct sk_buff *skb, * bool all_slaves); * Get the xmit slave of master device. If all_slaves is true, the function * assumes all the slaves can transmit. * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); * Adjusts the requested feature flags according to device-specific * constraints, and returns the resulting flags. Must not modify * the device state. * * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features); * Called to update device configuration to new features. Passed * feature set might be less than what was returned by ndo_fix_features(). * Must return >0 or -errno if it changed dev->features itself. * * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid, u16 flags, * struct netlink_ext_ack *extack); * Adds an FDB entry to dev for addr.
* int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid, * struct netlink_ext_ack *extack); * Deletes the FDB entry from dev corresponding to addr. * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, * struct netlink_ext_ack *extack); * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, * struct net_device *dev, struct net_device *filter_dev, * int *idx) * Used to add FDB entries to dump requests. Implementers should add * entries to skb and update idx with the number of entries. * * int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[], * u16 nlmsg_flags, struct netlink_ext_ack *extack); * Adds an MDB entry to dev. * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], * struct netlink_ext_ack *extack); * Deletes the MDB entry from dev. * int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[], * struct netlink_ext_ack *extack); * Bulk deletes MDB entries from dev. * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, * struct netlink_callback *cb); * Dumps MDB entries from dev. The first argument (marker) in the netlink * callback is used by core rtnetlink code. * * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, * u16 flags, struct netlink_ext_ack *extack) * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, * struct net_device *dev, u32 filter_mask, * int nlflags) * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh, * u16 flags); * * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); * Called to change device carrier. Soft-devices (like dummy, team, etc) * which do not represent real hardware may define this to allow their * userspace components to manage their virtual carrier state.
Devices * that determine carrier state from physical hardware properties (eg * network cables) or protocol-dependent mechanisms (eg * USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function. * * int (*ndo_get_phys_port_id)(struct net_device *dev, * struct netdev_phys_item_id *ppid); * Called to get ID of physical port of this device. If driver does * not implement this, it is assumed that the hw is not able to have * multiple net devices on single physical port. * * int (*ndo_get_port_parent_id)(struct net_device *dev, * struct netdev_phys_item_id *ppid) * Called to get the parent ID of the physical port of this device. * * void* (*ndo_dfwd_add_station)(struct net_device *pdev, * struct net_device *dev) * Called by upper layer devices to accelerate switching or other * station functionality into hardware. 'pdev' is the lowerdev * to use for the offload and 'dev' is the net device that will * back the offload. Returns a pointer to the private structure * the upper layer will maintain. * void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv) * Called by upper layer device to delete the station created * by 'ndo_dfwd_add_station'. 'pdev' is the net device backing * the station and priv is the structure returned by the add * operation. * int (*ndo_set_tx_maxrate)(struct net_device *dev, * int queue_index, u32 maxrate); * Called when a user wants to set a max-rate limitation of specific * TX queue. * int (*ndo_get_iflink)(const struct net_device *dev); * Called to get the iflink value of this device. * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); * This function is used to get egress tunnel information for given skb. * This is useful for retrieving outer tunnel header parameters while * sampling packet. * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); * This function is used to specify the headroom that the skb must * consider when allocating skb during packet reception.
Setting * appropriate rx headroom value allows avoiding skb head copy on * forward. Setting a negative value resets the rx headroom to the * default value. * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, * u32 flags); * This function is used to submit @n XDP packets for transmit on a * netdevice. Returns number of frames successfully transmitted, frames * that got dropped are freed/returned via xdp_return_frame(). * Returns negative number, means general error invoking ndo, meaning * no frames were xmit'ed and core-caller will free all frames. * struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev, * struct xdp_buff *xdp); * Get the xmit slave of master device based on the xdp_buff. * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); * This function is used to wake up the softirq, ksoftirqd or kthread * responsible for sending and/or receiving packets on a specific * queue id bound to an AF_XDP socket. The flags field specifies if * only RX, only Tx, or both should be woken up using the flags * XDP_WAKEUP_RX and XDP_WAKEUP_TX. * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); * If a device is paired with a peer device, return the peer instance. * The caller must be under RCU read context. 
* int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); * Get the forwarding path to reach the real device from the HW destination address * ktime_t (*ndo_get_tstamp)(struct net_device *dev, * const struct skb_shared_hwtstamps *hwtstamps, * bool cycles); * Get hardware timestamp based on normal/adjustable time or free running * cycle counter. This function is required if physical clock supports a * free running cycle counter. * * int (*ndo_hwtstamp_get)(struct net_device *dev, * struct kernel_hwtstamp_config *kernel_config); * Get the currently configured hardware timestamping parameters for the * NIC device. * * int (*ndo_hwtstamp_set)(struct net_device *dev, * struct kernel_hwtstamp_config *kernel_config, * struct netlink_ext_ack *extack); * Change the hardware timestamping parameters for NIC device. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); void (*ndo_uninit)(struct net_device *dev); int (*ndo_open)(struct net_device *dev); int (*ndo_stop)(struct net_device *dev); netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb, struct net_device *dev); netdev_features_t (*ndo_features_check)(struct sk_buff *skb, struct net_device *dev, netdev_features_t features); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); int (*ndo_set_mac_address)(struct net_device *dev, void *addr); int (*ndo_validate_addr)(struct net_device *dev); int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd); int (*ndo_siocwandev)(struct net_device *dev, struct if_settings *ifs); int (*ndo_siocdevprivate)(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int (*ndo_set_config)(struct net_device *dev, 
struct ifmap *map); int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); int (*ndo_neigh_setup)(struct net_device *dev, struct neigh_parms *); void (*ndo_tx_timeout) (struct net_device *dev, unsigned int txqueue); void (*ndo_get_stats64)(struct net_device *dev, struct rtnl_link_stats64 *storage); bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id); int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev, void *attr_data); struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid); int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid); #ifdef CONFIG_NET_POLL_CONTROLLER void (*ndo_poll_controller)(struct net_device *dev); int (*ndo_netpoll_setup)(struct net_device *dev, struct netpoll_info *info); void (*ndo_netpoll_cleanup)(struct net_device *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); int (*ndo_set_vf_vlan)(struct net_device *dev, int queue, u16 vlan, u8 qos, __be16 proto); int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate); int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting); int (*ndo_get_vf_config)(struct net_device *dev, int vf, struct ifla_vf_info *ivf); int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); int (*ndo_get_vf_stats)(struct net_device *dev, int vf, struct ifla_vf_stats *vf_stats); int (*ndo_set_vf_port)(struct net_device *dev, int vf, struct nlattr *port[]); int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); int (*ndo_get_vf_guid)(struct net_device *dev, int vf, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); int (*ndo_set_vf_guid)(struct net_device *dev, int vf, u64 guid, int guid_type); int (*ndo_set_vf_rss_query_en)( struct net_device *dev, int vf, bool setting); int 
(*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type, void *type_data); #if IS_ENABLED(CONFIG_FCOE) int (*ndo_fcoe_enable)(struct net_device *dev); int (*ndo_fcoe_disable)(struct net_device *dev); int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc); int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid); int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc); int (*ndo_fcoe_get_hbainfo)(struct net_device *dev, struct netdev_fcoe_hbainfo *hbainfo); #endif #if IS_ENABLED(CONFIG_LIBFCOE) #define NETDEV_FCOE_WWNN 0 #define NETDEV_FCOE_WWPN 1 int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type); #endif #ifdef CONFIG_RFS_ACCEL int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb, u16 rxq_index, u32 flow_id); #endif int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev, struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); struct net_device* (*ndo_get_xmit_slave)(struct net_device *dev, struct sk_buff *skb, bool all_slaves); struct net_device* (*ndo_sk_get_lower_dev)(struct net_device *dev, struct sock *sk); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, netdev_features_t features); int (*ndo_neigh_construct)(struct net_device *dev, struct neighbour *n); void (*ndo_neigh_destroy)(struct net_device *dev, struct neighbour *n); int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, struct netlink_ext_ack *extack); int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, struct netlink_ext_ack *extack); int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack); int (*ndo_fdb_dump)(struct 
sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx); int (*ndo_fdb_get)(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack); int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack); int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb); int (*ndo_mdb_get)(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack); int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, struct netlink_ext_ack *extack); int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags); int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh, u16 flags); int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); int (*ndo_get_phys_port_id)(struct net_device *dev, struct netdev_phys_item_id *ppid); int (*ndo_get_port_parent_id)(struct net_device *dev, struct netdev_phys_item_id *ppid); int (*ndo_get_phys_port_name)(struct net_device *dev, char *name, size_t len); void* (*ndo_dfwd_add_station)(struct net_device *pdev, struct net_device *dev); void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv); int (*ndo_set_tx_maxrate)(struct net_device *dev, int queue_index, u32 maxrate); int (*ndo_get_iflink)(const struct net_device *dev); int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, int n, 
struct xdp_frame **xdp, u32 flags); struct net_device * (*ndo_xdp_get_xmit_slave)(struct net_device *dev, struct xdp_buff *xdp); int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); ktime_t (*ndo_get_tstamp)(struct net_device *dev, const struct skb_shared_hwtstamps *hwtstamps, bool cycles); int (*ndo_hwtstamp_get)(struct net_device *dev, struct kernel_hwtstamp_config *kernel_config); int (*ndo_hwtstamp_set)(struct net_device *dev, struct kernel_hwtstamp_config *kernel_config, struct netlink_ext_ack *extack); }; /** * enum netdev_priv_flags - &struct net_device priv_flags * * These are the &struct net_device, they are only set internally * by drivers and used in the kernel. These flags are invisible to * userspace; this means that the order of these flags can change * during any kernel release. * * You should have a pretty good reason to be extending these flags. 
* * @IFF_802_1Q_VLAN: 802.1Q VLAN device * @IFF_EBRIDGE: Ethernet bridging device * @IFF_BONDING: bonding master or slave * @IFF_ISATAP: ISATAP interface (RFC4214) * @IFF_WAN_HDLC: WAN HDLC device * @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to * release skb->dst * @IFF_DONT_BRIDGE: disallow bridging this ether dev * @IFF_DISABLE_NETPOLL: disable netpoll at run-time * @IFF_MACVLAN_PORT: device used as macvlan port * @IFF_BRIDGE_PORT: device used as bridge port * @IFF_OVS_DATAPATH: device used as Open vSwitch datapath port * @IFF_TX_SKB_SHARING: The interface supports sharing skbs on transmit * @IFF_UNICAST_FLT: Supports unicast filtering * @IFF_TEAM_PORT: device used as team port * @IFF_SUPP_NOFCS: device supports sending custom FCS * @IFF_LIVE_ADDR_CHANGE: device supports hardware address * change when it's running * @IFF_MACVLAN: Macvlan device * @IFF_XMIT_DST_RELEASE_PERM: IFF_XMIT_DST_RELEASE not taking into account * underlying stacked devices * @IFF_L3MDEV_MASTER: device is an L3 master device * @IFF_NO_QUEUE: device can run without qdisc attached * @IFF_OPENVSWITCH: device is an Open vSwitch master * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device * @IFF_TEAM: device is a team device * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external * entity (i.e.
the master device for bridged veth) * @IFF_MACSEC: device is a MACsec device * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device * @IFF_NO_ADDRCONF: prevent ipv6 addrconf * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN * @IFF_SEE_ALL_HWTSTAMP_REQUESTS: device wants to see calls to * ndo_hwtstamp_set() for all timestamp requests regardless of source, * even if those aren't HWTSTAMP_SOURCE_NETDEV. */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, IFF_EBRIDGE = 1<<1, IFF_BONDING = 1<<2, IFF_ISATAP = 1<<3, IFF_WAN_HDLC = 1<<4, IFF_XMIT_DST_RELEASE = 1<<5, IFF_DONT_BRIDGE = 1<<6, IFF_DISABLE_NETPOLL = 1<<7, IFF_MACVLAN_PORT = 1<<8, IFF_BRIDGE_PORT = 1<<9, IFF_OVS_DATAPATH = 1<<10, IFF_TX_SKB_SHARING = 1<<11, IFF_UNICAST_FLT = 1<<12, IFF_TEAM_PORT = 1<<13, IFF_SUPP_NOFCS = 1<<14, IFF_LIVE_ADDR_CHANGE = 1<<15, IFF_MACVLAN = 1<<16, IFF_XMIT_DST_RELEASE_PERM = 1<<17, IFF_L3MDEV_MASTER = 1<<18, IFF_NO_QUEUE = 1<<19, IFF_OPENVSWITCH = 1<<20, IFF_L3MDEV_SLAVE = 1<<21, IFF_TEAM = 1<<22, IFF_RXFH_CONFIGURED = 1<<23, IFF_PHONY_HEADROOM = 1<<24, IFF_MACSEC = 1<<25, IFF_NO_RX_HANDLER = 1<<26, IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_NO_ADDRCONF = BIT_ULL(30), IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), IFF_SEE_ALL_HWTSTAMP_REQUESTS = BIT_ULL(33), }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN #define IFF_EBRIDGE IFF_EBRIDGE #define IFF_BONDING IFF_BONDING #define IFF_ISATAP IFF_ISATAP #define IFF_WAN_HDLC IFF_WAN_HDLC #define IFF_XMIT_DST_RELEASE IFF_XMIT_DST_RELEASE #define IFF_DONT_BRIDGE IFF_DONT_BRIDGE #define IFF_DISABLE_NETPOLL IFF_DISABLE_NETPOLL 
#define IFF_MACVLAN_PORT		IFF_MACVLAN_PORT
#define IFF_BRIDGE_PORT			IFF_BRIDGE_PORT
#define IFF_OVS_DATAPATH		IFF_OVS_DATAPATH
#define IFF_TX_SKB_SHARING		IFF_TX_SKB_SHARING
#define IFF_UNICAST_FLT			IFF_UNICAST_FLT
#define IFF_TEAM_PORT			IFF_TEAM_PORT
#define IFF_SUPP_NOFCS			IFF_SUPP_NOFCS
#define IFF_LIVE_ADDR_CHANGE		IFF_LIVE_ADDR_CHANGE
#define IFF_MACVLAN			IFF_MACVLAN
#define IFF_XMIT_DST_RELEASE_PERM	IFF_XMIT_DST_RELEASE_PERM
#define IFF_L3MDEV_MASTER		IFF_L3MDEV_MASTER
#define IFF_NO_QUEUE			IFF_NO_QUEUE
#define IFF_OPENVSWITCH			IFF_OPENVSWITCH
#define IFF_L3MDEV_SLAVE		IFF_L3MDEV_SLAVE
#define IFF_TEAM			IFF_TEAM
#define IFF_RXFH_CONFIGURED		IFF_RXFH_CONFIGURED
#define IFF_PHONY_HEADROOM		IFF_PHONY_HEADROOM
#define IFF_MACSEC			IFF_MACSEC
#define IFF_NO_RX_HANDLER		IFF_NO_RX_HANDLER
#define IFF_FAILOVER			IFF_FAILOVER
#define IFF_FAILOVER_SLAVE		IFF_FAILOVER_SLAVE
#define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER
#define IFF_TX_SKB_NO_LINEAR		IFF_TX_SKB_NO_LINEAR

/* Specifies the type of the struct net_device::ml_priv pointer */
enum netdev_ml_priv_type {
	ML_PRIV_NONE,	/* first enumerator, i.e. value 0 */
	ML_PRIV_CAN,
};

/*
 * Kind of per-CPU statistics associated with a device; the comment on each
 * enumerator names the structure it corresponds to.
 */
enum netdev_stat_type {
	NETDEV_PCPU_STAT_NONE,
	NETDEV_PCPU_STAT_LSTATS, /* struct pcpu_lstats */
	NETDEV_PCPU_STAT_TSTATS, /* struct pcpu_sw_netstats */
	NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */
};

/* Registration state of a device; see the per-enumerator comments. */
enum netdev_reg_state {
	NETREG_UNINITIALIZED = 0,
	NETREG_REGISTERED,	/* completed register_netdevice */
	NETREG_UNREGISTERING,	/* called unregister_netdevice */
	NETREG_UNREGISTERED,	/* completed unregister todo */
	NETREG_RELEASED,	/* called free_netdev */
	NETREG_DUMMY,		/* dummy device for NAPI poll */
};

/**
 *	struct net_device - The DEVICE structure.
 *
 *	Actually, this whole structure is a big mistake.  It mixes I/O
 *	data with strictly "high-level" data, and it has to know about
 *	almost every data structure used in the INET module.
 *
 *	@name:	This is the first field of the "visible" part of this structure
 *		(i.e. as seen by users in the "Space.c" file).  It is the name
 *		of the interface.
* * @name_node: Name hashlist node * @ifalias: SNMP alias * @mem_end: Shared memory end * @mem_start: Shared memory start * @base_addr: Device I/O address * @irq: Device IRQ number * * @state: Generic network queuing layer state, see netdev_state_t * @dev_list: The global list of network devices * @napi_list: List entry used for polling NAPI devices * @unreg_list: List entry when we are unregistering the * device; see the function unregister_netdev * @close_list: List entry used when we are closing the device * @ptype_all: Device-specific packet handlers for all protocols * @ptype_specific: Device-specific, protocol-specific packet handlers * * @adj_list: Directly linked devices, like slaves for bonding * @features: Currently active device features * @hw_features: User-changeable features * * @wanted_features: User-requested features * @vlan_features: Mask of features inheritable by VLAN devices * * @hw_enc_features: Mask of features inherited by encapsulating devices * This field indicates what encapsulation * offloads the hardware is capable of doing, * and drivers will need to set them appropriately. * * @mpls_features: Mask of features inheritable by MPLS * @gso_partial_features: value(s) from NETIF_F_GSO\* * * @ifindex: interface index * @group: The group the device belongs to * * @stats: Statistics struct, which was left as a legacy, use * rtnl_link_stats64 instead * * @core_stats: core networking counters, * do not use this in drivers * @carrier_up_count: Number of times the carrier has been up * @carrier_down_count: Number of times the carrier has been down * * @wireless_handlers: List of functions to handle Wireless Extensions, * instead of ioctl, * see <net/iw_handler.h> for details. * @wireless_data: Instance data managed by the core of wireless extensions * * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions * @xdp_metadata_ops: Includes pointers to XDP metadata callbacks. 
 *	@xsk_tx_metadata_ops:	Includes pointers to AF_XDP TX metadata callbacks.
 *	@ethtool_ops:	Management operations
 *	@l3mdev_ops:	Layer 3 master device operations
 *	@ndisc_ops:	Includes callbacks for different IPv6 neighbour
 *			discovery handling. Necessary for e.g. 6LoWPAN.
 *	@xfrmdev_ops:	Transformation offload operations
 *	@tlsdev_ops:	Transport Layer Security offload operations
 *	@header_ops:	Includes callbacks for creating, parsing, caching, etc.
 *			of Layer 2 headers.
 *
 *	@flags:		Interface flags (a la BSD)
 *	@xdp_features:	XDP capability supported by the device
 *	@priv_flags:	Like 'flags' but invisible to userspace,
 *			see enum netdev_priv_flags for the definitions
 *	@gflags:	Global flags (kept as legacy)
 *	@priv_len:	Size of the ->priv flexible array
 *	@priv:		Flexible array containing private data
 *	@operstate:	RFC2863 operstate
 *	@link_mode:	Mapping policy to operstate
 *	@if_port:	Selectable AUI, TP, ...
 *	@dma:		DMA channel
 *	@mtu:		Interface MTU value
 *	@min_mtu:	Interface Minimum MTU value
 *	@max_mtu:	Interface Maximum MTU value
 *	@type:		Interface hardware type
 *	@hard_header_len: Maximum hardware header length.
 *	@min_header_len:  Minimum hardware header length
 *
 *	@needed_headroom: Extra headroom the hardware may need, but not in all
 *			  cases can this be guaranteed
 *	@needed_tailroom: Extra tailroom the hardware may need, but not in all
 *			  cases can this be guaranteed. Some cases also use
 *			  LL_MAX_HEADER instead to allocate the skb
 *
 *	interface address info:
 *
 *	@perm_addr:		Permanent hw address
 *	@addr_assign_type:	Hw address assignment type
 *	@addr_len:		Hardware address length
 *	@upper_level:		Maximum depth level of upper devices.
 *	@lower_level:		Maximum depth level of lower devices.
* @neigh_priv_len: Used in neigh_alloc() * @dev_id: Used to differentiate devices that share * the same link layer address * @dev_port: Used to differentiate devices that share * the same function * @addr_list_lock: XXX: need comments on this one * @name_assign_type: network interface name assignment type * @uc_promisc: Counter that indicates promiscuous mode * has been enabled due to the need to listen to * additional unicast addresses in a device that * does not implement ndo_set_rx_mode() * @uc: unicast mac addresses * @mc: multicast mac addresses * @dev_addrs: list of device hw addresses * @queues_kset: Group of all Kobjects in the Tx and RX queues * @promiscuity: Number of times the NIC is told to work in * promiscuous mode; if it becomes 0 the NIC will * exit promiscuous mode * @allmulti: Counter, enables or disables allmulticast mode * * @vlan_info: VLAN info * @dsa_ptr: dsa specific data * @tipc_ptr: TIPC specific data * @atalk_ptr: AppleTalk link * @ip_ptr: IPv4 specific data * @ip6_ptr: IPv6 specific data * @ax25_ptr: AX.25 specific data * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering * @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network * device struct * @mpls_ptr: mpls_dev struct pointer * @mctp_ptr: MCTP specific data * * @dev_addr: Hw address (before bcast, * because most packets are unicast) * * @_rx: Array of RX queues * @num_rx_queues: Number of RX queues * allocated at register_netdev() time * @real_num_rx_queues: Number of RX queues currently active in device * @xdp_prog: XDP sockets filter program pointer * @gro_flush_timeout: timeout for GRO layer in NAPI * @napi_defer_hard_irqs: If not zero, provides a counter that would * allow to avoid NIC hard IRQ, on busy queues. 
 *
 *	@rx_handler:		handler for received packets
 *	@rx_handler_data:	XXX: need comments on this one
 *	@tcx_ingress:		BPF & clsact qdisc specific data for ingress processing
 *	@ingress_queue:		XXX: need comments on this one
 *	@nf_hooks_ingress:	netfilter hooks executed for ingress packets
 *	@broadcast:		hw bcast address
 *
 *	@rx_cpu_rmap:	CPU reverse-mapping for RX completion interrupts,
 *			indexed by RX queue number. Assigned by driver.
 *			This must only be set if the ndo_rx_flow_steer
 *			operation is defined
 *	@index_hlist:		Device index hash chain
 *
 *	@_tx:			Array of TX queues
 *	@num_tx_queues:		Number of TX queues allocated at alloc_netdev_mq() time
 *	@real_num_tx_queues:	Number of TX queues currently active in device
 *	@qdisc:			Root qdisc from userspace point of view
 *	@tx_queue_len:		Max frames per queue allowed
 *	@tx_global_lock:	XXX: need comments on this one
 *	@xdp_bulkq:		XDP device bulk queue
 *	@xps_maps:		all CPUs/RXQs maps for XPS device
 *
 *	@tcx_egress:		BPF & clsact qdisc specific data for egress processing
 *	@nf_hooks_egress:	netfilter hooks executed for egress packets
 *	@qdisc_hash:		qdisc hash table
 *	@watchdog_timeo:	Represents the timeout that is used by
 *				the watchdog (see dev_watchdog())
 *	@watchdog_timer:	List of timers
 *
 *	@proto_down_reason:	reason a netdev interface is held down
 *	@pcpu_refcnt:		Number of references to this device
 *	@dev_refcnt:		Number of references to this device
 *	@refcnt_tracker:	Tracker directory for tracked references to this device
 *	@todo_list:		Delayed register/unregister
 *	@link_watch_list:	XXX: need comments on this one
 *
 *	@reg_state:		Register/unregister state machine
 *	@dismantle:		Device is going to be freed
 *	@rtnl_link_state:	This enum represents the phases of creating
 *				a new link
 *
 *	@needs_free_netdev:	Should unregister perform free_netdev?
* @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one * @nd_net: Network namespace this network device is inside * * @ml_priv: Mid-layer private * @ml_priv_type: Mid-layer private type * * @pcpu_stat_type: Type of device statistics which the core should * allocate/free: none, lstats, tstats, dstats. none * means the driver is handling statistics allocation/ * freeing internally. * @lstats: Loopback statistics: packets, bytes * @tstats: Tunnel statistics: RX/TX packets, RX/TX bytes * @dstats: Dummy statistics: RX/TX/drop packets, RX/TX bytes * * @garp_port: GARP * @mrp_port: MRP * * @dm_private: Drop monitor private * * @dev: Class/net/name entry * @sysfs_groups: Space for optional device, statistics and wireless * sysfs groups * * @sysfs_rx_queue_group: Space for optional per-rx queue attributes * @rtnl_link_ops: Rtnl_link_ops * @stat_ops: Optional ops for queue-aware statistics * @queue_mgmt_ops: Optional ops for queue management * * @gso_max_size: Maximum size of generic segmentation offload * @tso_max_size: Device (as in HW) limit on the max TSO request size * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO * @tso_max_segs: Device (as in HW) limit on the max TSO segment count * @gso_ipv4_max_size: Maximum size of generic segmentation offload, * for IPv4. * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device * @tc_to_txq: XXX: need comments on this one * @prio_tc_map: XXX: need comments on this one * * @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp * * @priomap: XXX: need comments on this one * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. * * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock * * @proto_down: protocol port state information can be sent to the * switch driver and used to set the phys state of the * switch port. 
* * @threaded: napi threaded mode is enabled * * @net_notifier_list: List of per-net netdev notifier block * that follow this device when it is moved * to another network namespace. * * @macsec_ops: MACsec offloading ops * * @udp_tunnel_nic_info: static structure describing the UDP tunnel * offload capabilities of the device * @udp_tunnel_nic: UDP tunnel offload state * @ethtool: ethtool related state * @xdp_state: stores info on attached XDP BPF programs * * @nested_level: Used as a parameter of spin_lock_nested() of * dev->addr_list_lock. * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. * @gro_max_size: Maximum size of aggregated packet in generic * receive offload (GRO) * @gro_ipv4_max_size: Maximum size of aggregated packet in generic * receive offload (GRO), for IPv4. * @xdp_zc_max_segs: Maximum number of segments supported by AF_XDP * zero copy driver * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. * @watchdog_dev_tracker: refcount tracker used by watchdog. * @dev_registered_tracker: tracker for reference held while * registered * @offload_xstats_l3: L3 HW stats for this netdevice. * * @devlink_port: Pointer to related devlink port structure. * Assigned by a driver before netdev registration using * SET_NETDEV_DEVLINK_PORT macro. This pointer is static * during the time netdevice is registered. * * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ struct net_device { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/net_device.rst. * Please update the document when adding new fields. 
*/

	/* TX read-mostly hotpath */
	__cacheline_group_begin(net_device_read_tx);
	unsigned long long	priv_flags;
	const struct net_device_ops *netdev_ops;
	const struct header_ops *header_ops;
	struct netdev_queue	*_tx;
	netdev_features_t	gso_partial_features;
	unsigned int		real_num_tx_queues;
	unsigned int		gso_max_size;
	unsigned int		gso_ipv4_max_size;
	u16			gso_max_segs;
	s16			num_tc;
	/* Note : dev->mtu is often read without holding a lock.
	 * Writers usually hold RTNL.
	 * It is recommended to use READ_ONCE() to annotate the reads,
	 * and to use WRITE_ONCE() to annotate the writes.
	 */
	unsigned int		mtu;
	unsigned short		needed_headroom;
	struct netdev_tc_txq	tc_to_txq[TC_MAX_QUEUE];
#ifdef CONFIG_XPS
	struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
#endif
#ifdef CONFIG_NETFILTER_EGRESS
	struct nf_hook_entries __rcu *nf_hooks_egress;
#endif
#ifdef CONFIG_NET_XGRESS
	struct bpf_mprog_entry __rcu *tcx_egress;
#endif
	__cacheline_group_end(net_device_read_tx);

	/* TXRX read-mostly hotpath */
	__cacheline_group_begin(net_device_read_txrx);
	/*
	 * The three per-CPU statistics pointers share storage; presumably
	 * pcpu_stat_type tells which member is in use - confirm.
	 */
	union {
		struct pcpu_lstats __percpu		*lstats;
		struct pcpu_sw_netstats __percpu	*tstats;
		struct pcpu_dstats __percpu		*dstats;
	};
	unsigned long		state;
	unsigned int		flags;
	unsigned short		hard_header_len;
	netdev_features_t	features;
	struct inet6_dev __rcu	*ip6_ptr;
	__cacheline_group_end(net_device_read_txrx);

	/* RX read-mostly hotpath */
	__cacheline_group_begin(net_device_read_rx);
	struct bpf_prog __rcu	*xdp_prog;
	struct list_head	ptype_specific;
	int			ifindex;
	unsigned int		real_num_rx_queues;
	struct netdev_rx_queue	*_rx;
	unsigned long		gro_flush_timeout;
	int			napi_defer_hard_irqs;
	unsigned int		gro_max_size;
	unsigned int		gro_ipv4_max_size;
	rx_handler_func_t __rcu	*rx_handler;
	void __rcu		*rx_handler_data;
	possible_net_t			nd_net;
#ifdef CONFIG_NETPOLL
	struct netpoll_info __rcu	*npinfo;
#endif
#ifdef CONFIG_NET_XGRESS
	struct bpf_mprog_entry __rcu *tcx_ingress;
#endif
	__cacheline_group_end(net_device_read_rx);

	/* Fields below this point are outside the three cacheline groups. */
	char			name[IFNAMSIZ];
	struct netdev_name_node	*name_node;
struct dev_ifalias	__rcu *ifalias;
	/*
	 *	I/O specific fields
	 *	FIXME: Merge these and struct ifmap into one
	 */
	unsigned long		mem_end;
	unsigned long		mem_start;
	unsigned long		base_addr;

	/*
	 *	Some hardware also needs these fields (state,dev_list,
	 *	napi_list,unreg_list,close_list) but they are not
	 *	part of the usual set specified in Space.c.
	 */

	struct list_head	dev_list;
	struct list_head	napi_list;
	struct list_head	unreg_list;
	struct list_head	close_list;
	struct list_head	ptype_all;

	/* Two list heads, one for upper and one for lower linked devices. */
	struct {
		struct list_head upper;
		struct list_head lower;
	} adj_list;

	/* Read-mostly cache-line for fast-path access */
	xdp_features_t		xdp_features;
	const struct xdp_metadata_ops *xdp_metadata_ops;
	const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops;
	unsigned short		gflags;

	unsigned short		needed_tailroom;

	/* Feature masks */
	netdev_features_t	hw_features;
	netdev_features_t	wanted_features;
	netdev_features_t	vlan_features;
	netdev_features_t	hw_enc_features;
	netdev_features_t	mpls_features;

	unsigned int		min_mtu;
	unsigned int		max_mtu;
	unsigned short		type;
	unsigned char		min_header_len;
	unsigned char		name_assign_type;

	int			group;

	struct net_device_stats	stats; /* not used by modern drivers */

	struct net_device_core_stats __percpu *core_stats;

	/* Stats to monitor link on/off, flapping */
	atomic_t		carrier_up_count;
	atomic_t		carrier_down_count;

	/* Operation tables; most are only present with their config option. */
#ifdef CONFIG_WIRELESS_EXT
	const struct iw_handler_def *wireless_handlers;
	struct iw_public_data	*wireless_data;
#endif
	const struct ethtool_ops *ethtool_ops;
#ifdef CONFIG_NET_L3_MASTER_DEV
	const struct l3mdev_ops	*l3mdev_ops;
#endif
#if IS_ENABLED(CONFIG_IPV6)
	const struct ndisc_ops *ndisc_ops;
#endif

#ifdef CONFIG_XFRM_OFFLOAD
	const struct xfrmdev_ops *xfrmdev_ops;
#endif

#if IS_ENABLED(CONFIG_TLS_DEVICE)
	const struct tlsdev_ops *tlsdev_ops;
#endif

	unsigned int		operstate;
	unsigned char		link_mode;

	unsigned char		if_port;
	unsigned char		dma;

	/* Interface address info.
*/
	unsigned char		perm_addr[MAX_ADDR_LEN];
	unsigned char		addr_assign_type;
	unsigned char		addr_len;
	unsigned char		upper_level;
	unsigned char		lower_level;

	unsigned short		neigh_priv_len;
	unsigned short          dev_id;
	unsigned short          dev_port;
	int			irq;
	/* Element count of the trailing priv[] array (see its __counted_by). */
	u32			priv_len;

	spinlock_t		addr_list_lock;

	/* Hardware address lists */
	struct netdev_hw_addr_list	uc;
	struct netdev_hw_addr_list	mc;
	struct netdev_hw_addr_list	dev_addrs;

#ifdef CONFIG_SYSFS
	struct kset		*queues_kset;
#endif
#ifdef CONFIG_LOCKDEP
	struct list_head	unlink_list;
#endif
	unsigned int		promiscuity;
	unsigned int		allmulti;
	bool			uc_promisc;
#ifdef CONFIG_LOCKDEP
	unsigned char		nested_level;
#endif


	/* Protocol-specific pointers */
	struct in_device __rcu	*ip_ptr;
#if IS_ENABLED(CONFIG_VLAN_8021Q)
	struct vlan_info __rcu	*vlan_info;
#endif
#if IS_ENABLED(CONFIG_NET_DSA)
	struct dsa_port		*dsa_ptr;
#endif
#if IS_ENABLED(CONFIG_TIPC)
	struct tipc_bearer __rcu *tipc_ptr;
#endif
#if IS_ENABLED(CONFIG_ATALK)
	void 			*atalk_ptr;
#endif
#if IS_ENABLED(CONFIG_AX25)
	void			*ax25_ptr;
#endif
#if IS_ENABLED(CONFIG_CFG80211)
	struct wireless_dev	*ieee80211_ptr;
#endif
	/* Present when either IEEE802154 or 6LOWPAN is enabled. */
#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN)
	struct wpan_dev		*ieee802154_ptr;
#endif
#if IS_ENABLED(CONFIG_MPLS_ROUTING)
	struct mpls_dev __rcu	*mpls_ptr;
#endif
#if IS_ENABLED(CONFIG_MCTP)
	struct mctp_dev __rcu	*mctp_ptr;
#endif

/*
 * Cache lines mostly used on receive path (including eth_type_trans())
 */
	/* Interface address info used in eth_type_trans() */
	const unsigned char	*dev_addr;

	unsigned int		num_rx_queues;
#define GRO_LEGACY_MAX_SIZE	65536u
/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE),
 * and shinfo->gso_segs is a 16bit field.
*/
#define GRO_MAX_SIZE		(8 * 65535u)
	unsigned int		xdp_zc_max_segs;
	struct netdev_queue __rcu *ingress_queue;
#ifdef CONFIG_NETFILTER_INGRESS
	struct nf_hook_entries __rcu *nf_hooks_ingress;
#endif

	unsigned char		broadcast[MAX_ADDR_LEN];
#ifdef CONFIG_RFS_ACCEL
	struct cpu_rmap		*rx_cpu_rmap;
#endif
	struct hlist_node	index_hlist;

/*
 * Cache lines mostly used on transmit path
 */
	unsigned int		num_tx_queues;
	struct Qdisc __rcu	*qdisc;
	unsigned int		tx_queue_len;
	spinlock_t		tx_global_lock;

	struct xdp_dev_bulk_queue __percpu *xdp_bulkq;

#ifdef CONFIG_NET_SCHED
	DECLARE_HASHTABLE	(qdisc_hash, 4);
#endif
	/* These may be needed for future network-power-down code. */
	struct timer_list	watchdog_timer;
	int			watchdog_timeo;

	u32                     proto_down_reason;

	struct list_head	todo_list;

	/*
	 * Reference count: a per-CPU int when CONFIG_PCPU_DEV_REFCNT is set,
	 * otherwise a single refcount_t.
	 */
#ifdef CONFIG_PCPU_DEV_REFCNT
	int __percpu		*pcpu_refcnt;
#else
	refcount_t		dev_refcnt;
#endif
	struct ref_tracker_dir	refcnt_tracker;

	struct list_head	link_watch_list;

	u8 reg_state;

	bool dismantle;

	/* Anonymous enum stored in a 16-bit bitfield. */
	enum {
		RTNL_LINK_INITIALIZED,
		RTNL_LINK_INITIALIZING,
	} rtnl_link_state:16;

	bool needs_free_netdev;
	void (*priv_destructor)(struct net_device *dev);

	/* mid-layer private */
	void				*ml_priv;
	enum netdev_ml_priv_type	ml_priv_type;

	/* Stored in an 8-bit bitfield. */
	enum netdev_stat_type		pcpu_stat_type:8;

#if IS_ENABLED(CONFIG_GARP)
	struct garp_port __rcu	*garp_port;
#endif
#if IS_ENABLED(CONFIG_MRP)
	struct mrp_port __rcu	*mrp_port;
#endif
#if IS_ENABLED(CONFIG_NET_DROP_MONITOR)
	struct dm_hw_stat_delta __rcu *dm_private;
#endif
	/* Embedded device object; to_net_dev() maps it back to this struct. */
	struct device		dev;
	const struct attribute_group *sysfs_groups[4];
	const struct attribute_group *sysfs_rx_queue_group;

	const struct rtnl_link_ops *rtnl_link_ops;

	const struct netdev_stat_ops *stat_ops;

	const struct netdev_queue_mgmt_ops *queue_mgmt_ops;

	/* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SEGS		65535u
#define GSO_LEGACY_MAX_SIZE	65536u
/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE),
 * and shinfo->gso_segs is a 16bit field.
*/
#define GSO_MAX_SIZE		(8 * GSO_MAX_SEGS)

#define TSO_LEGACY_MAX_SIZE	65536
#define TSO_MAX_SIZE		UINT_MAX
	unsigned int		tso_max_size;
#define TSO_MAX_SEGS		U16_MAX
	u16			tso_max_segs;

#ifdef CONFIG_DCB
	const struct dcbnl_rtnl_ops *dcbnl_ops;
#endif
	/* One entry per value of (prio & TC_BITMASK). */
	u8			prio_tc_map[TC_BITMASK + 1];

#if IS_ENABLED(CONFIG_FCOE)
	unsigned int		fcoe_ddp_xid;
#endif
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
	struct netprio_map __rcu *priomap;
#endif
	struct phy_device	*phydev;
	struct sfp_bus		*sfp_bus;
	struct lock_class_key	*qdisc_tx_busylock;
	bool			proto_down;
	bool			threaded;

	struct list_head	net_notifier_list;

#if IS_ENABLED(CONFIG_MACSEC)
	/* MACsec management functions */
	const struct macsec_ops *macsec_ops;
#endif
	const struct udp_tunnel_nic_info	*udp_tunnel_nic_info;
	struct udp_tunnel_nic	*udp_tunnel_nic;

	struct ethtool_netdev_state *ethtool;

	/* protected by rtnl_lock */
	struct bpf_xdp_entity	xdp_state[__MAX_XDP_MODE];

	u8 dev_addr_shadow[MAX_ADDR_LEN];
	netdevice_tracker	linkwatch_dev_tracker;
	netdevice_tracker	watchdog_dev_tracker;
	netdevice_tracker	dev_registered_tracker;
	struct rtnl_hw_stats64	*offload_xstats_l3;

	struct devlink_port	*devlink_port;

#if IS_ENABLED(CONFIG_DPLL)
	struct dpll_pin __rcu	*dpll_pin;
#endif
#if IS_ENABLED(CONFIG_PAGE_POOL)
	/** @page_pools: page pools created for this netdevice */
	struct hlist_head	page_pools;
#endif

	/** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */
	struct dim_irq_moder	*irq_moder;

	/*
	 * Flexible array member, so it must stay the last field; its element
	 * count is given by priv_len.
	 */
	u8			priv[] ____cacheline_aligned
				       __counted_by(priv_len);
} ____cacheline_aligned;

/* Map a pointer to the embedded 'dev' member back to its net_device. */
#define to_net_dev(d) container_of(d, struct net_device, dev)

/*
 * Driver should use this to assign devlink port instance to a netdevice
 * before it registers the netdevice. Therefore devlink_port is static
 * during the netdev lifetime after it is registered.
*/ #define SET_NETDEV_DEVLINK_PORT(dev, port) \ ({ \ WARN_ON((dev)->reg_state != NETREG_UNINITIALIZED); \ ((dev)->devlink_port = (port)); \ }) static inline bool netif_elide_gro(const struct net_device *dev) { if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog) return true; return false; } #define NETDEV_ALIGN 32 static inline int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio) { return dev->prio_tc_map[prio & TC_BITMASK]; } static inline int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc) { if (tc >= dev->num_tc) return -EINVAL; dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK; return 0; } int netdev_txq_to_tc(struct net_device *dev, unsigned int txq); void netdev_reset_tc(struct net_device *dev); int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset); int netdev_set_num_tc(struct net_device *dev, u8 num_tc); static inline int netdev_get_num_tc(struct net_device *dev) { return dev->num_tc; } static inline void net_prefetch(void *p) { prefetch(p); #if L1_CACHE_BYTES < 128 prefetch((u8 *)p + L1_CACHE_BYTES); #endif } static inline void net_prefetchw(void *p) { prefetchw(p); #if L1_CACHE_BYTES < 128 prefetchw((u8 *)p + L1_CACHE_BYTES); #endif } void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev); int netdev_bind_sb_channel_queue(struct net_device *dev, struct net_device *sb_dev, u8 tc, u16 count, u16 offset); int netdev_set_sb_channel(struct net_device *dev, u16 channel); static inline int netdev_get_sb_channel(struct net_device *dev) { return max_t(int, -dev->num_tc, 0); } static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, unsigned int index) { DEBUG_NET_WARN_ON_ONCE(index >= dev->num_tx_queues); return &dev->_tx[index]; } static inline struct netdev_queue *skb_get_tx_queue(const struct net_device *dev, const struct sk_buff *skb) { return netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); } static inline void netdev_for_each_tx_queue(struct 
net_device *dev,
					    void (*f)(struct net_device *,
						      struct netdev_queue *,
						      void *),
					    void *arg)
{
	unsigned int i;

	/* Call f(dev, queue, arg) once for each of the num_tx_queues queues. */
	for (i = 0; i < dev->num_tx_queues; i++)
		f(dev, &dev->_tx[i], arg);
}

/*
 * Give the locks of @dev lockdep classes: stores the address of a key in
 * dev->qdisc_tx_busylock and sets the class of addr_list_lock and of every
 * TX queue's _xmit_lock. The keys are static objects declared inside the
 * macro body, so each expansion site has its own set. The expansion is a
 * bare { } block that declares a local named 'i'.
 */
#define netdev_lockdep_set_classes(dev)				\
{								\
	static struct lock_class_key qdisc_tx_busylock_key;	\
	static struct lock_class_key qdisc_xmit_lock_key;	\
	static struct lock_class_key dev_addr_list_lock_key;	\
	unsigned int i;						\
								\
	(dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key;	\
	lockdep_set_class(&(dev)->addr_list_lock,		\
			  &dev_addr_list_lock_key);		\
	for (i = 0; i < (dev)->num_tx_queues; i++)		\
		lockdep_set_class(&(dev)->_tx[i]._xmit_lock,	\
				  &qdisc_xmit_lock_key);	\
}

u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
		     struct net_device *sb_dev);
struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
					 struct sk_buff *skb,
					 struct net_device *sb_dev);

/* returns the headroom that the master device needs to take in account
 * when forwarding to this dev
 */
static inline unsigned netdev_get_fwd_headroom(struct net_device *dev)
{
	return dev->priv_flags & IFF_PHONY_HEADROOM ?
0 : dev->needed_headroom; } static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr) { if (dev->netdev_ops->ndo_set_rx_headroom) dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr); } /* set the device rx headroom to the dev's default */ static inline void netdev_reset_rx_headroom(struct net_device *dev) { netdev_set_rx_headroom(dev, -1); } static inline void *netdev_get_ml_priv(struct net_device *dev, enum netdev_ml_priv_type type) { if (dev->ml_priv_type != type) return NULL; return dev->ml_priv; } static inline void netdev_set_ml_priv(struct net_device *dev, void *ml_priv, enum netdev_ml_priv_type type) { WARN(dev->ml_priv_type && dev->ml_priv_type != type, "Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n", dev->ml_priv_type, type); WARN(!dev->ml_priv_type && dev->ml_priv, "Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n"); dev->ml_priv = ml_priv; dev->ml_priv_type = type; } /* * Net namespace inlines */ static inline struct net *dev_net(const struct net_device *dev) { return read_pnet(&dev->nd_net); } static inline void dev_net_set(struct net_device *dev, struct net *net) { write_pnet(&dev->nd_net, net); } /** * netdev_priv - access network device private data * @dev: network device * * Get network device private data */ static inline void *netdev_priv(const struct net_device *dev) { return (void *)dev->priv; } /* Set the sysfs physical device reference for the network logical device * if set prior to registration will cause a symlink during initialization. */ #define SET_NETDEV_DEV(net, pdev) ((net)->dev.parent = (pdev)) /* Set the sysfs device type for the network logical device to allow * fine-grained identification of different network device types. For * example Ethernet, Wireless LAN, Bluetooth, WiMAX etc. 
*/
#define SET_NETDEV_DEVTYPE(net, devtype)	((net)->dev.type = (devtype))

void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
			  enum netdev_queue_type type,
			  struct napi_struct *napi);

/* Record @irq in napi->irq. */
static inline void netif_napi_set_irq(struct napi_struct *napi, int irq)
{
	napi->irq = irq;
}

/* Default NAPI poll() weight
 * Device drivers are strongly advised not to use a bigger value
 */
#define NAPI_POLL_WEIGHT 64

void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
			   int (*poll)(struct napi_struct *, int), int weight);

/**
 * netif_napi_add() - initialize a NAPI context
 * @dev:  network device
 * @napi: NAPI context
 * @poll: polling function
 *
 * netif_napi_add() must be used to initialize a NAPI context prior to calling
 * *any* of the other NAPI-related functions.
 *
 * The context is added with the default weight, NAPI_POLL_WEIGHT.
 */
static inline void
netif_napi_add(struct net_device *dev, struct napi_struct *napi,
	       int (*poll)(struct napi_struct *, int))
{
	netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
}

/*
 * Like netif_napi_add_weight(), but sets NAPI_STATE_NO_BUSY_POLL in
 * napi->state first.
 */
static inline void
netif_napi_add_tx_weight(struct net_device *dev,
			 struct napi_struct *napi,
			 int (*poll)(struct napi_struct *, int),
			 int weight)
{
	set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state);
	netif_napi_add_weight(dev, napi, poll, weight);
}

/**
 * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
 * @dev:  network device
 * @napi: NAPI context
 * @poll: polling function
 *
 * This variant of netif_napi_add() should be used from drivers using NAPI
 * to exclusively poll a TX queue.
 * This avoids adding it to napi_hash[] and thereby polluting that hash table.
 */
static inline void netif_napi_add_tx(struct net_device *dev,
				     struct napi_struct *napi,
				     int (*poll)(struct napi_struct *, int))
{
	netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
}

/**
 *  __netif_napi_del - remove a NAPI context
 *  @napi: NAPI context
 *
 * Warning: caller must observe RCU grace period before freeing memory
 * containing @napi.
Drivers might want to call this helper to combine
 * all the needed RCU grace periods into a single one.
 */
void __netif_napi_del(struct napi_struct *napi);

/**
 *  netif_napi_del - remove a NAPI context
 *  @napi: NAPI context
 *
 *  netif_napi_del() removes a NAPI context from the network device NAPI list.
 *  It calls __netif_napi_del() followed by synchronize_net().
 */
static inline void netif_napi_del(struct napi_struct *napi)
{
	__netif_napi_del(napi);
	synchronize_net();
}

/* A packet handler: the type it handles, an optional device, and callbacks. */
struct packet_type {
	__be16			type;	/* This is really htons(ether_type). */
	bool			ignore_outgoing;
	struct net_device	*dev;	/* NULL is wildcarded here	     */
	netdevice_tracker	dev_tracker;
	/* per-skb handler */
	int			(*func) (struct sk_buff *,
					 struct net_device *,
					 struct packet_type *,
					 struct net_device *);
	/* handler taking a list head instead of a single skb */
	void			(*list_func) (struct list_head *,
					      struct packet_type *,
					      struct net_device *);
	bool			(*id_match)(struct packet_type *ptype,
					    struct sock *sk);
	struct net		*af_packet_net;
	void			*af_packet_priv;
	struct list_head	list;
};

/* GSO/GRO callbacks carried by a struct packet_offload. */
struct offload_callbacks {
	struct sk_buff		*(*gso_segment)(struct sk_buff *skb,
						netdev_features_t features);
	struct sk_buff		*(*gro_receive)(struct list_head *head,
						struct sk_buff *skb);
	int			(*gro_complete)(struct sk_buff *skb, int nhoff);
};

struct packet_offload {
	__be16			 type;	/* This is really htons(ether_type).
*/
	u16			 priority;
	struct offload_callbacks callbacks;
	struct list_head	 list;
};

/* often modified stats are per-CPU, other are shared (netdev->stats) */
struct pcpu_sw_netstats {
	u64_stats_t		rx_packets;
	u64_stats_t		rx_bytes;
	u64_stats_t		tx_packets;
	u64_stats_t		tx_bytes;
	struct u64_stats_sync   syncp;
} __aligned(4 * sizeof(u64));

/* RX/TX packet, byte and drop counters plus their u64_stats_sync. */
struct pcpu_dstats {
	u64_stats_t		rx_packets;
	u64_stats_t		rx_bytes;
	u64_stats_t		rx_drops;
	u64_stats_t		tx_packets;
	u64_stats_t		tx_bytes;
	u64_stats_t		tx_drops;
	struct u64_stats_sync	syncp;
} __aligned(8 * sizeof(u64));

/* Packet and byte counters plus their u64_stats_sync. */
struct pcpu_lstats {
	u64_stats_t packets;
	u64_stats_t bytes;
	struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));

void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes);

/*
 * Account one received packet of @len bytes in this CPU's dev->tstats.
 * Both counters are updated between u64_stats_update_begin() and
 * u64_stats_update_end() on the same syncp.
 */
static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int len)
{
	struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);

	u64_stats_update_begin(&tstats->syncp);
	u64_stats_add(&tstats->rx_bytes, len);
	u64_stats_inc(&tstats->rx_packets);
	u64_stats_update_end(&tstats->syncp);
}

/*
 * Account @packets transmitted packets totalling @len bytes in this CPU's
 * dev->tstats.
 */
static inline void dev_sw_netstats_tx_add(struct net_device *dev,
					  unsigned int packets,
					  unsigned int len)
{
	struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);

	u64_stats_update_begin(&tstats->syncp);
	u64_stats_add(&tstats->tx_bytes, len);
	u64_stats_add(&tstats->tx_packets, packets);
	u64_stats_update_end(&tstats->syncp);
}

/* Account one packet of @len bytes in this CPU's dev->lstats. */
static inline void dev_lstats_add(struct net_device *dev, unsigned int len)
{
	struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats);

	u64_stats_update_begin(&lstats->syncp);
	u64_stats_add(&lstats->bytes, len);
	u64_stats_inc(&lstats->packets);
	u64_stats_update_end(&lstats->syncp);
}

/*
 * Allocate per-CPU objects of @type with alloc_percpu_gfp() and, if that
 * returned non-NULL, run u64_stats_init() on the syncp member of every
 * possible CPU's copy. Evaluates to the per-CPU pointer.
 */
#define __netdev_alloc_pcpu_stats(type, gfp)				\
({									\
	typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\
	if (pcpu_stats)	{						\
		int __cpu;						\
		for_each_possible_cpu(__cpu) {				\
			typeof(type) *stat;				\
			stat = per_cpu_ptr(pcpu_stats, __cpu);		\
			u64_stats_init(&stat->syncp);			\
		}							\
	}								\
	pcpu_stats;							\
})

#define netdev_alloc_pcpu_stats(type)					\
__netdev_alloc_pcpu_stats(type, GFP_KERNEL)

/*
 * Variant that allocates with devm_alloc_percpu(@dev, @type). On success,
 * u64_stats_init() is called on the ->syncp member of every possible CPU's
 * copy. The expression evaluates to the per-CPU pointer.
 */
#define devm_netdev_alloc_pcpu_stats(dev, type)				\
({									\
	typeof(type) __percpu *pcpu_stats = devm_alloc_percpu(dev, type);\
	if (pcpu_stats) {						\
		int __cpu;						\
		for_each_possible_cpu(__cpu) {				\
			typeof(type) *stat;				\
			stat = per_cpu_ptr(pcpu_stats, __cpu);		\
			u64_stats_init(&stat->syncp);			\
		}							\
	}								\
	pcpu_stats;							\
})

/* Transmit policy values stored in netdev_lag_upper_info.tx_type. */
enum netdev_lag_tx_type {
	NETDEV_LAG_TX_TYPE_UNKNOWN,
	NETDEV_LAG_TX_TYPE_RANDOM,
	NETDEV_LAG_TX_TYPE_BROADCAST,
	NETDEV_LAG_TX_TYPE_ROUNDROBIN,
	NETDEV_LAG_TX_TYPE_ACTIVEBACKUP,
	NETDEV_LAG_TX_TYPE_HASH,
};

/* Hash policy values stored in netdev_lag_upper_info.hash_type. */
enum netdev_lag_hash {
	NETDEV_LAG_HASH_NONE,
	NETDEV_LAG_HASH_L2,
	NETDEV_LAG_HASH_L34,
	NETDEV_LAG_HASH_L23,
	NETDEV_LAG_HASH_E23,
	NETDEV_LAG_HASH_E34,
	NETDEV_LAG_HASH_VLAN_SRCMAC,
	NETDEV_LAG_HASH_UNKNOWN,
};

/* Pairs a LAG transmit type with its hash type. */
struct netdev_lag_upper_info {
	enum netdev_lag_tx_type tx_type;
	enum netdev_lag_hash hash_type;
};

/* Two one-bit flags: link state and whether transmit is enabled. */
struct netdev_lag_lower_state_info {
	u8 link_up : 1,
	   tx_enabled : 1;
};

#include <linux/notifier.h>

/* netdevice notifier chain. Please remember to update netdev_cmd_to_name()
 * and the rtnetlink notification exclusion list in rtnetlink_event() when
 * adding new types.
*/
enum netdev_cmd {
	/* Numbering starts at 1; later entries take consecutive values. */
	NETDEV_UP	= 1,	/* For now you can't veto a device up/down */
	NETDEV_DOWN,
	NETDEV_REBOOT,		/* Tell a protocol stack a network interface
				   detected a hardware crash and restarted
				   - we can use this eg to kick tcp sessions
				   once done */
	NETDEV_CHANGE,		/* Notify device state change */
	NETDEV_REGISTER,
	NETDEV_UNREGISTER,
	NETDEV_CHANGEMTU,	/* notify after mtu change happened */
	NETDEV_CHANGEADDR,	/* notify after the address change */
	NETDEV_PRE_CHANGEADDR,	/* notify before the address change */
	NETDEV_GOING_DOWN,
	NETDEV_CHANGENAME,
	NETDEV_FEAT_CHANGE,
	NETDEV_BONDING_FAILOVER,
	NETDEV_PRE_UP,
	NETDEV_PRE_TYPE_CHANGE,
	NETDEV_POST_TYPE_CHANGE,
	NETDEV_POST_INIT,
	NETDEV_PRE_UNINIT,
	NETDEV_RELEASE,
	NETDEV_NOTIFY_PEERS,
	NETDEV_JOIN,
	NETDEV_CHANGEUPPER,
	NETDEV_RESEND_IGMP,
	NETDEV_PRECHANGEMTU,	/* notify before mtu change happened */
	NETDEV_CHANGEINFODATA,
	NETDEV_BONDING_INFO,
	NETDEV_PRECHANGEUPPER,
	NETDEV_CHANGELOWERSTATE,
	NETDEV_UDP_TUNNEL_PUSH_INFO,
	NETDEV_UDP_TUNNEL_DROP_INFO,
	NETDEV_CHANGE_TX_QUEUE_LEN,
	NETDEV_CVLAN_FILTER_PUSH_INFO,
	NETDEV_CVLAN_FILTER_DROP_INFO,
	NETDEV_SVLAN_FILTER_PUSH_INFO,
	NETDEV_SVLAN_FILTER_DROP_INFO,
	NETDEV_OFFLOAD_XSTATS_ENABLE,
	NETDEV_OFFLOAD_XSTATS_DISABLE,
	NETDEV_OFFLOAD_XSTATS_REPORT_USED,
	NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
	NETDEV_XDP_FEAT_CHANGE,
};
const char *netdev_cmd_to_name(enum netdev_cmd cmd);

/*
 * Registration interface for netdevice notifier blocks. The _net variants
 * take a struct net, the _dev_net variants a device plus a
 * struct netdev_net_notifier.
 */
int register_netdevice_notifier(struct notifier_block *nb);
int unregister_netdevice_notifier(struct notifier_block *nb);
int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb);
int unregister_netdevice_notifier_net(struct net *net,
				      struct notifier_block *nb);
int register_netdevice_notifier_dev_net(struct net_device *dev,
					struct notifier_block *nb,
					struct netdev_net_notifier *nn);
int unregister_netdevice_notifier_dev_net(struct net_device *dev,
					  struct notifier_block *nb,
					  struct netdev_net_notifier *nn);

/*
 * Common header of every notifier payload: the device concerned and an
 * optional extended-ack pointer (set to NULL by netdev_notifier_info_init()).
 */
struct netdev_notifier_info {
	struct net_device	*dev;
	struct netlink_ext_ack	*extack;
};

struct
netdev_notifier_info_ext {
	struct netdev_notifier_info info; /* must be first */
	/* Extra payload; the only member visible here is an MTU value. */
	union {
		u32 mtu;
	} ext;
};

/* Payload carrying a mask of changed flags. */
struct netdev_notifier_change_info {
	struct netdev_notifier_info info; /* must be first */
	unsigned int flags_changed;
};

/* Payload describing an upper device being linked or unlinked. */
struct netdev_notifier_changeupper_info {
	struct netdev_notifier_info info; /* must be first */
	struct net_device *upper_dev; /* new upper dev */
	bool master; /* is upper dev master */
	bool linking; /* is the notification for link or unlink */
	void *upper_info; /* upper dev info */
};

/* Payload carrying an opaque pointer to the lower device state. */
struct netdev_notifier_changelowerstate_info {
	struct netdev_notifier_info info; /* must be first */
	void *lower_state_info; /* is lower dev state */
};

/* Payload carrying a device address pointer. */
struct netdev_notifier_pre_changeaddr_info {
	struct netdev_notifier_info info; /* must be first */
	const unsigned char *dev_addr;
};

/* Kinds of offload extended statistics; numbering starts at 1. */
enum netdev_offload_xstats_type {
	NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1,
};

/*
 * Payload for the offload xstats notifications; which union member is
 * meaningful depends on the notification, as annotated on each member.
 */
struct netdev_notifier_offload_xstats_info {
	struct netdev_notifier_info info; /* must be first */
	enum netdev_offload_xstats_type type;

	union {
		/* NETDEV_OFFLOAD_XSTATS_REPORT_DELTA */
		struct netdev_notifier_offload_xstats_rd *report_delta;
		/* NETDEV_OFFLOAD_XSTATS_REPORT_USED */
		struct netdev_notifier_offload_xstats_ru *report_used;
	};
};

/* Offload xstats API; implemented outside this header. */
int netdev_offload_xstats_enable(struct net_device *dev,
				 enum netdev_offload_xstats_type type,
				 struct netlink_ext_ack *extack);
int netdev_offload_xstats_disable(struct net_device *dev,
				  enum netdev_offload_xstats_type type);
bool netdev_offload_xstats_enabled(const struct net_device *dev,
				   enum netdev_offload_xstats_type type);
int netdev_offload_xstats_get(struct net_device *dev,
			      enum netdev_offload_xstats_type type,
			      struct rtnl_hw_stats64 *stats, bool *used,
			      struct netlink_ext_ack *extack);
void
netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *rd,
				   const struct rtnl_hw_stats64 *stats);
void
netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *ru);
void netdev_offload_xstats_push_delta(struct net_device *dev,
				      enum
netdev_offload_xstats_type type,
				      const struct rtnl_hw_stats64 *stats);

/* Initialise @info for @dev with no extended-ack pointer. */
static inline void netdev_notifier_info_init(struct netdev_notifier_info *info,
					     struct net_device *dev)
{
	info->dev = dev;
	info->extack = NULL;
}

/* Return the device stored in @info. */
static inline struct net_device *
netdev_notifier_info_to_dev(const struct netdev_notifier_info *info)
{
	return info->dev;
}

/* Return the extended-ack pointer stored in @info (may be NULL). */
static inline struct netlink_ext_ack *
netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
{
	return info->extack;
}

int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
int call_netdevice_notifiers_info(unsigned long val,
				  struct netdev_notifier_info *info);

/*
 * Iterators over the device list anchored at (net)->dev_base_head and
 * linked through net_device.dev_list. Each one forwards to the
 * list_for_each_entry*() helper of the same flavour.
 */
#define for_each_netdev(net, d)		\
		list_for_each_entry(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_reverse(net, d)	\
		list_for_each_entry_reverse(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_rcu(net, d)		\
		list_for_each_entry_rcu(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_safe(net, d, n)	\
		list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue(net, d)		\
		list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue_reverse(net, d)		\
		list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \
						     dev_list)
#define for_each_netdev_continue_rcu(net, d)		\
	list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list)
/*
 * Walk the devices of init_net and run the loop body only for those whose
 * netdev_master_upper_dev_get_rcu() result equals @bond. The expansion ends
 * in a bare "if", so a following "else" would attach to it.
 */
#define for_each_netdev_in_bond_rcu(bond, slave)	\
		for_each_netdev_rcu(&init_net, slave)	\
			if (netdev_master_upper_dev_get_rcu(slave) == (bond))
/* Convert a dev_list list_head pointer back to its struct net_device. */
#define net_device_entry(lh)	list_entry(lh, struct net_device, dev_list)

/*
 * Iterate over (net)->dev_by_index with xa_find(), starting from the
 * caller-initialised @ifindex and incrementing it after each entry.
 * The loop ends when xa_find() returns NULL.
 */
#define for_each_netdev_dump(net, d, ifindex)				\
	for (; (d = xa_find(&(net)->dev_by_index, &ifindex,		\
			    ULONG_MAX, XA_PRESENT)); ifindex++)

/*
 * Return the device following @dev on its namespace's device list, or NULL
 * when @dev is the last entry (the next pointer is the list head).
 */
static inline struct net_device *next_net_device(struct net_device *dev)
{
	struct list_head *lh;
	struct net *net;

	net = dev_net(dev);
	lh = dev->dev_list.next;
	return lh == &net->dev_base_head ? NULL : net_device_entry(lh);
}

/* Same as next_net_device(), but the next pointer is read with rcu_dereference(). */
static inline struct net_device *next_net_device_rcu(struct net_device *dev)
{
	struct list_head *lh;
	struct net *net;

	net = dev_net(dev);
	lh = rcu_dereference(list_next_rcu(&dev->dev_list));
	return lh == &net->dev_base_head ? NULL : net_device_entry(lh);
}

/* Return the first device of @net, or NULL when the list is empty. */
static inline struct net_device *first_net_device(struct net *net)
{
	return list_empty(&net->dev_base_head) ? NULL :
		net_device_entry(net->dev_base_head.next);
}

/* Same as first_net_device(), but the head's next pointer is read with rcu_dereference(). */
static inline struct net_device *first_net_device_rcu(struct net *net)
{
	struct list_head *lh = rcu_dereference(list_next_rcu(&net->dev_base_head));

	return lh == &net->dev_base_head ? NULL : net_device_entry(lh);
}

/* Device lookup, packet-type and open/close API; implemented outside this header. */
int netdev_boot_setup_check(struct net_device *dev);
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *hwaddr);
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type);
void dev_add_pack(struct packet_type *pt);
void dev_remove_pack(struct packet_type *pt);
void __dev_remove_pack(struct packet_type *pt);
void dev_add_offload(struct packet_offload *po);
void dev_remove_offload(struct packet_offload *po);

int dev_get_iflink(const struct net_device *dev);
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
			  struct net_device_path_stack *stack);
struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
				      unsigned short mask);
struct net_device *dev_get_by_name(struct net *net, const char *name);
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name);
struct net_device *__dev_get_by_name(struct net *net, const char *name);
bool netdev_name_in_use(struct net *net, const char *name);
int dev_alloc_name(struct net_device *dev, const char *name);
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack);
void dev_close(struct net_device *dev);
void dev_close_many(struct list_head *head, bool unlink);
void
dev_disable_lro(struct net_device *dev);
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
		     struct net_device *sb_dev);
u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
		       struct net_device *sb_dev);

int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev);
int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id);

/* Queue @skb for transmission via __dev_queue_xmit() with no sb_dev. */
static inline int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}

/* Queue @skb for transmission via __dev_queue_xmit(), passing @sb_dev through. */
static inline int dev_queue_xmit_accel(struct sk_buff *skb,
				       struct net_device *sb_dev)
{
	return __dev_queue_xmit(skb, sb_dev);
}

/*
 * Call __dev_direct_xmit() for @skb on @queue_id and return its result.
 * When dev_xmit_complete() reports the result as not complete, the skb is
 * freed here with kfree_skb().
 */
static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
	int ret;

	ret = __dev_direct_xmit(skb, queue_id);
	if (!dev_xmit_complete(ret))
		kfree_skb(skb);
	return ret;
}

int register_netdevice(struct net_device *dev);
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
void unregister_netdevice_many(struct list_head *head);
/* Unregister @dev via unregister_netdevice_queue() with a NULL list head. */
static inline void unregister_netdevice(struct net_device *dev)
{
	unregister_netdevice_queue(dev, NULL);
}

int netdev_refcnt_read(const struct net_device *dev);
void free_netdev(struct net_device *dev);
void init_dummy_netdev(struct net_device *dev);

struct net_device *netdev_get_xmit_slave(struct net_device *dev,
					 struct sk_buff *skb,
					 bool all_slaves);
struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
					    struct sock *sk);
struct net_device *dev_get_by_index(struct net *net, int ifindex);
struct net_device *__dev_get_by_index(struct net *net, int ifindex);
struct net_device *netdev_get_by_index(struct net *net, int ifindex,
				       netdevice_tracker *tracker, gfp_t gfp);
struct net_device *netdev_get_by_name(struct net *net, const char *name,
				      netdevice_tracker *tracker, gfp_t gfp);
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
struct net_device *dev_get_by_napi_id(unsigned int napi_id);
void netdev_copy_name(struct net_device
*dev, char *name);

/*
 * Build a link-layer header through dev->header_ops->create().
 * Returns 0 when the device has no header_ops or no create hook,
 * otherwise whatever the hook returns.
 */
static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
				  unsigned short type,
				  const void *daddr, const void *saddr,
				  unsigned int len)
{
	if (!dev->header_ops || !dev->header_ops->create)
		return 0;

	return dev->header_ops->create(skb, dev, type, daddr, saddr, len);
}

/*
 * Parse the link-layer header of @skb through skb->dev's header_ops->parse().
 * Returns 0 when the device has no header_ops or no parse hook.
 */
static inline int dev_parse_header(const struct sk_buff *skb,
				   unsigned char *haddr)
{
	const struct net_device *dev = skb->dev;

	if (!dev->header_ops || !dev->header_ops->parse)
		return 0;
	return dev->header_ops->parse(skb, haddr);
}

/*
 * Return the result of skb->dev's header_ops->parse_protocol(), or 0 when
 * the device has no header_ops or no such hook.
 */
static inline __be16 dev_parse_header_protocol(const struct sk_buff *skb)
{
	const struct net_device *dev = skb->dev;

	if (!dev->header_ops || !dev->header_ops->parse_protocol)
		return 0;
	return dev->header_ops->parse_protocol(skb);
}

/*
 * ll_header must have at least hard_header_len allocated.
 *
 * Decide whether a caller-supplied link-layer header of @len bytes is
 * acceptable for @dev:
 *  - @len >= hard_header_len: accepted;
 *  - @len <  min_header_len:  rejected;
 *  - otherwise, a caller with CAP_SYS_RAWIO is accepted after the bytes
 *    from @len up to hard_header_len have been zero-filled;
 *  - otherwise the decision is left to header_ops->validate() when the
 *    device provides it, and the header is rejected when it does not.
 */
static inline bool dev_validate_header(const struct net_device *dev,
				       char *ll_header, int len)
{
	if (likely(len >= dev->hard_header_len))
		return true;
	if (len < dev->min_header_len)
		return false;

	if (capable(CAP_SYS_RAWIO)) {
		memset(ll_header + len, 0, dev->hard_header_len - len);
		return true;
	}

	if (dev->header_ops && dev->header_ops->validate)
		return dev->header_ops->validate(ll_header, len);

	return false;
}

/* True when @dev has header_ops with a create hook. */
static inline bool dev_has_header(const struct net_device *dev)
{
	return dev->header_ops && dev->header_ops->create;
}

/*
 * Incoming packets are placed on per-CPU queues
 */
struct softnet_data {
	struct list_head	poll_list;
	struct sk_buff_head	process_queue;
	local_lock_t		process_queue_bh_lock;

	/* stats */
	unsigned int		processed;
	unsigned int		time_squeeze;
#ifdef CONFIG_RPS
	struct softnet_data	*rps_ipi_list;
#endif

	unsigned int		received_rps;
	bool			in_net_rx_action;
	bool			in_napi_threaded_poll;

#ifdef CONFIG_NET_FLOW_LIMIT
	struct sd_flow_limit __rcu *flow_limit;
#endif
	struct Qdisc		*output_queue;
	struct Qdisc		**output_queue_tailp;
	struct sk_buff		*completion_queue;
#ifdef CONFIG_XFRM_OFFLOAD
	struct sk_buff_head	xfrm_backlog;
#endif
	/* written and read only by owning cpu: */
	struct netdev_xmit xmit;
#ifdef CONFIG_RPS
	/* input_queue_head should be written by cpu owning this struct,
	 * and only read by other cpus. Worth using a cache line.
	 */
	unsigned int		input_queue_head ____cacheline_aligned_in_smp;

	/* Elements below can be accessed between CPUs for RPS/RFS */
	call_single_data_t	csd ____cacheline_aligned_in_smp;
	struct softnet_data	*rps_ipi_next;
	unsigned int		cpu;
	unsigned int		input_queue_tail;
#endif
	struct sk_buff_head	input_pkt_queue;
	struct napi_struct	backlog;

	atomic_t		dropped ____cacheline_aligned_in_smp;

	/* Another possibly contended cache line */
	spinlock_t		defer_lock ____cacheline_aligned_in_smp;
	int			defer_count;
	int			defer_ipi_scheduled;
	struct sk_buff		*defer_list;
	call_single_data_t	defer_csd;
};

DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);

/*
 * Current transmit recursion level: read from this CPU's softnet_data
 * without CONFIG_PREEMPT_RT, and from the current task with it.
 */
#ifndef CONFIG_PREEMPT_RT
static inline int dev_recursion_level(void)
{
	return this_cpu_read(softnet_data.xmit.recursion);
}
#else
static inline int dev_recursion_level(void)
{
	return current->net_xmit.recursion;
}

#endif

void __netif_schedule(struct Qdisc *q);
void netif_schedule_queue(struct netdev_queue *txq);

/* Call netif_schedule_queue() on each of @dev's num_tx_queues TX queues. */
static inline void netif_tx_schedule_all(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++)
		netif_schedule_queue(netdev_get_tx_queue(dev, i));
}

/* Clear the __QUEUE_STATE_DRV_XOFF bit in @dev_queue's state. */
static __always_inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
{
	clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}

/**
 *	netif_start_queue - allow transmit
 *	@dev: network device
 *
 *	Allow upper layers to call the device hard_start_xmit routine.
*/ static inline void netif_start_queue(struct net_device *dev) { netif_tx_start_queue(netdev_get_tx_queue(dev, 0)); } static inline void netif_tx_start_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_start_queue(txq); } } void netif_tx_wake_queue(struct netdev_queue *dev_queue); /** * netif_wake_queue - restart transmit * @dev: network device * * Allow upper layers to call the device hard_start_xmit routine. * Used for flow control when transmit resources are available. */ static inline void netif_wake_queue(struct net_device *dev) { netif_tx_wake_queue(netdev_get_tx_queue(dev, 0)); } static inline void netif_tx_wake_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_wake_queue(txq); } } static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue) { /* Must be an atomic op see netif_txq_try_stop() */ set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } /** * netif_stop_queue - stop transmitted packets * @dev: network device * * Stop upper layers calling the device hard_start_xmit routine. * Used for flow control when transmit resources are unavailable. */ static inline void netif_stop_queue(struct net_device *dev) { netif_tx_stop_queue(netdev_get_tx_queue(dev, 0)); } void netif_tx_stop_all_queues(struct net_device *dev); static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue) { return test_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } /** * netif_queue_stopped - test if transmit queue is flowblocked * @dev: network device * * Test if transmit queue on device is currently unable to send. 
*/ static inline bool netif_queue_stopped(const struct net_device *dev) { return netif_tx_queue_stopped(netdev_get_tx_queue(dev, 0)); } static inline bool netif_xmit_stopped(const struct netdev_queue *dev_queue) { return dev_queue->state & QUEUE_STATE_ANY_XOFF; } static inline bool netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_queue) { return dev_queue->state & QUEUE_STATE_ANY_XOFF_OR_FROZEN; } static inline bool netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue) { return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN; } /** * netdev_queue_set_dql_min_limit - set dql minimum limit * @dev_queue: pointer to transmit queue * @min_limit: dql minimum limit * * Forces xmit_more() to return true until the minimum threshold * defined by @min_limit is reached (or until the tx queue is * empty). Warning: to be use with care, misuse will impact the * latency. */ static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue, unsigned int min_limit) { #ifdef CONFIG_BQL dev_queue->dql.min_limit = min_limit; #endif } static inline int netdev_queue_dql_avail(const struct netdev_queue *txq) { #ifdef CONFIG_BQL /* Non-BQL migrated drivers will return 0, too. */ return dql_avail(&txq->dql); #else return 0; #endif } /** * netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write * @dev_queue: pointer to transmit queue * * BQL enabled drivers might use this helper in their ndo_start_xmit(), * to give appropriate hint to the CPU. */ static inline void netdev_txq_bql_enqueue_prefetchw(struct netdev_queue *dev_queue) { #ifdef CONFIG_BQL prefetchw(&dev_queue->dql.num_queued); #endif } /** * netdev_txq_bql_complete_prefetchw - prefetch bql data for write * @dev_queue: pointer to transmit queue * * BQL enabled drivers might use this helper in their TX completion path, * to give appropriate hint to the CPU. 
*/
static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_BQL
	/* Prefetches dql.limit for write. */
	prefetchw(&dev_queue->dql.limit);
#endif
}

/**
 *	netdev_tx_sent_queue - report the number of bytes queued to a given tx queue
 *	@dev_queue: network device queue
 *	@bytes: number of bytes queued to the device queue
 *
 *	Report the number of bytes queued for sending/completion to the network
 *	device hardware queue. @bytes should be a good approximation and should
 *	exactly match netdev_completed_queue() @bytes.
 *	This is typically called once per packet, from ndo_start_xmit().
 *
 *	Compiles to nothing when CONFIG_BQL is not set.
 */
static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue,
					unsigned int bytes)
{
#ifdef CONFIG_BQL
	dql_queued(&dev_queue->dql, bytes);

	/* Common case: dql_avail() is still non-negative, nothing more to do. */
	if (likely(dql_avail(&dev_queue->dql) >= 0))
		return;

	set_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);

	/*
	 * The XOFF flag must be set before checking the dql_avail below,
	 * because in netdev_tx_completed_queue we update the dql_completed
	 * before checking the XOFF flag.
	 */
	smp_mb();

	/* check again in case another CPU has just made room avail */
	if (unlikely(dql_avail(&dev_queue->dql) >= 0))
		clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);
#endif
}

/* Variant of netdev_tx_sent_queue() for drivers that are aware
 * that they should not test BQL status themselves.
 * We do want to change __QUEUE_STATE_STACK_XOFF only for the last
 * skb of a batch.
 * Returns true if the doorbell must be used to kick the NIC.
*/ static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes, bool xmit_more) { if (xmit_more) { #ifdef CONFIG_BQL dql_queued(&dev_queue->dql, bytes); #endif return netif_tx_queue_stopped(dev_queue); } netdev_tx_sent_queue(dev_queue, bytes); return true; } /** * netdev_sent_queue - report the number of bytes queued to hardware * @dev: network device * @bytes: number of bytes queued to the hardware device queue * * Report the number of bytes queued for sending/completion to the network * device hardware queue#0. @bytes should be a good approximation and should * exactly match netdev_completed_queue() @bytes. * This is typically called once per packet, from ndo_start_xmit(). */ static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes) { netdev_tx_sent_queue(netdev_get_tx_queue(dev, 0), bytes); } static inline bool __netdev_sent_queue(struct net_device *dev, unsigned int bytes, bool xmit_more) { return __netdev_tx_sent_queue(netdev_get_tx_queue(dev, 0), bytes, xmit_more); } /** * netdev_tx_completed_queue - report number of packets/bytes at TX completion. * @dev_queue: network device queue * @pkts: number of packets (currently ignored) * @bytes: number of bytes dequeued from the device queue * * Must be called at most once per TX completion round (and not per * individual packet), so that BQL can adjust its limits appropriately. 
*/
static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue,
					     unsigned int pkts, unsigned int bytes)
{
#ifdef CONFIG_BQL
	/* Nothing completed: leave the dql state untouched. */
	if (unlikely(!bytes))
		return;

	dql_completed(&dev_queue->dql, bytes);

	/*
	 * Without the memory barrier there is a small possibility that
	 * netdev_tx_sent_queue will miss the update and cause the queue to
	 * be stopped forever
	 */
	smp_mb(); /* NOTE: netdev_txq_completed_mb() assumes this exists */

	/* dql_avail() is still negative: do not touch the XOFF bit. */
	if (unlikely(dql_avail(&dev_queue->dql) < 0))
		return;

	/* Reschedule the queue only if this call cleared the XOFF bit. */
	if (test_and_clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state))
		netif_schedule_queue(dev_queue);
#endif
}

/**
 * 	netdev_completed_queue - report bytes and packets completed by device
 * 	@dev: network device
 * 	@pkts: actual number of packets sent over the medium
 * 	@bytes: actual number of bytes sent over the medium
 *
 * 	Report the number of bytes and packets transmitted by the network device
 * 	hardware queue over the physical medium, @bytes must exactly match the
 * 	@bytes amount passed to netdev_sent_queue()
 *
 * 	Operates on TX queue 0 via netdev_tx_completed_queue().
 */
static inline void netdev_completed_queue(struct net_device *dev,
					  unsigned int pkts, unsigned int bytes)
{
	netdev_tx_completed_queue(netdev_get_tx_queue(dev, 0), pkts, bytes);
}

/*
 * Clear __QUEUE_STATE_STACK_XOFF on @q and reset its dql state.
 * Compiles to nothing when CONFIG_BQL is not set.
 */
static inline void netdev_tx_reset_queue(struct netdev_queue *q)
{
#ifdef CONFIG_BQL
	clear_bit(__QUEUE_STATE_STACK_XOFF, &q->state);
	dql_reset(&q->dql);
#endif
}

/**
 * 	netdev_reset_queue - reset the packets and bytes count of a network device
 * 	@dev_queue: network device (despite the name, a struct net_device)
 *
 * 	Reset the bytes and packet count of a network device and clear the
 * 	software flow control OFF bit for this network device.
 * 	Operates on TX queue 0 via netdev_tx_reset_queue().
 */
static inline void netdev_reset_queue(struct net_device *dev_queue)
{
	netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0));
}

/**
 * 	netdev_cap_txqueue - check if selected tx queue exceeds device queues
 * 	@dev: network device
 * 	@queue_index: given tx queue index
 *
 * 	Returns 0 if given tx queue index >= number of device tx queues,
 * 	otherwise returns the originally passed tx queue index.
*/ static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index) { if (unlikely(queue_index >= dev->real_num_tx_queues)) { net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", dev->name, queue_index, dev->real_num_tx_queues); return 0; } return queue_index; } /** * netif_running - test if up * @dev: network device * * Test if the device has been brought up. */ static inline bool netif_running(const struct net_device *dev) { return test_bit(__LINK_STATE_START, &dev->state); } /* * Routines to manage the subqueues on a device. We only need start, * stop, and a check if it's stopped. All other device management is * done at the overall netdevice level. * Also test the device if we're multiqueue. */ /** * netif_start_subqueue - allow sending packets on subqueue * @dev: network device * @queue_index: sub queue index * * Start individual transmit queue of a device with multiple transmit queues. */ static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); netif_tx_start_queue(txq); } /** * netif_stop_subqueue - stop sending packets on subqueue * @dev: network device * @queue_index: sub queue index * * Stop individual transmit queue of a device with multiple transmit queues. */ static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); netif_tx_stop_queue(txq); } /** * __netif_subqueue_stopped - test status of subqueue * @dev: network device * @queue_index: sub queue index * * Check individual transmit queue of a device with multiple transmit queues. 
*/ static inline bool __netif_subqueue_stopped(const struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); return netif_tx_queue_stopped(txq); } /** * netif_subqueue_stopped - test status of subqueue * @dev: network device * @skb: sub queue buffer pointer * * Check individual transmit queue of a device with multiple transmit queues. */ static inline bool netif_subqueue_stopped(const struct net_device *dev, struct sk_buff *skb) { return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb)); } /** * netif_wake_subqueue - allow sending packets on subqueue * @dev: network device * @queue_index: sub queue index * * Resume individual transmit queue of a device with multiple transmit queues. */ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); netif_tx_wake_queue(txq); } #ifdef CONFIG_XPS int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index); int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type); /** * netif_attr_test_mask - Test a CPU or Rx queue set in a mask * @j: CPU/Rx queue index * @mask: bitmask of all cpus/rx queues * @nr_bits: number of bits in the bitmask * * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues. */ static inline bool netif_attr_test_mask(unsigned long j, const unsigned long *mask, unsigned int nr_bits) { cpu_max_bits_warn(j, nr_bits); return test_bit(j, mask); } /** * netif_attr_test_online - Test for online CPU/Rx queue * @j: CPU/Rx queue index * @online_mask: bitmask for CPUs/Rx queues that are online * @nr_bits: number of bits in the bitmask * * Returns true if a CPU/Rx queue is online. 
*/ static inline bool netif_attr_test_online(unsigned long j, const unsigned long *online_mask, unsigned int nr_bits) { cpu_max_bits_warn(j, nr_bits); if (online_mask) return test_bit(j, online_mask); return (j < nr_bits); } /** * netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask * @n: CPU/Rx queue index * @srcp: the cpumask/Rx queue mask pointer * @nr_bits: number of bits in the bitmask * * Returns >= nr_bits if no further CPUs/Rx queues set. */ static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, unsigned int nr_bits) { /* -1 is a legal arg here. */ if (n != -1) cpu_max_bits_warn(n, nr_bits); if (srcp) return find_next_bit(srcp, nr_bits, n + 1); return n + 1; } /** * netif_attrmask_next_and - get the next CPU/Rx queue in \*src1p & \*src2p * @n: CPU/Rx queue index * @src1p: the first CPUs/Rx queues mask pointer * @src2p: the second CPUs/Rx queues mask pointer * @nr_bits: number of bits in the bitmask * * Returns >= nr_bits if no further CPUs/Rx queues set in both. */ static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, const unsigned long *src2p, unsigned int nr_bits) { /* -1 is a legal arg here. 
*/ if (n != -1) cpu_max_bits_warn(n, nr_bits); if (src1p && src2p) return find_next_and_bit(src1p, src2p, nr_bits, n + 1); else if (src1p) return find_next_bit(src1p, nr_bits, n + 1); else if (src2p) return find_next_bit(src2p, nr_bits, n + 1); return n + 1; } #else static inline int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { return 0; } static inline int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) { return 0; } #endif /** * netif_is_multiqueue - test if device has multiple transmit queues * @dev: network device * * Check if device has multiple transmit queues */ static inline bool netif_is_multiqueue(const struct net_device *dev) { return dev->num_tx_queues > 1; } int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq); #ifdef CONFIG_SYSFS int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq); #else static inline int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxqs) { dev->real_num_rx_queues = rxqs; return 0; } #endif int netif_set_real_num_queues(struct net_device *dev, unsigned int txq, unsigned int rxq); int netif_get_num_default_rss_queues(void); void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason); void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason); /* * It is not allowed to call kfree_skb() or consume_skb() from hardware * interrupt context or with hardware interrupts being disabled. * (in_hardirq() || irqs_disabled()) * * We provide four helpers that can be used in following contexts : * * dev_kfree_skb_irq(skb) when caller drops a packet from irq context, * replacing kfree_skb(skb) * * dev_consume_skb_irq(skb) when caller consumes a packet from irq context. 
*  Typically used in place of consume_skb(skb) in TX completion path
 *
 * dev_kfree_skb_any(skb) when caller doesn't know its current irq context,
 *  replacing kfree_skb(skb)
 *
 * dev_consume_skb_any(skb) when caller doesn't know its current irq context,
 *  and consumed a packet. Used in place of consume_skb(skb)
 */
/* Drop from irq context: the _irq helper with SKB_DROP_REASON_NOT_SPECIFIED. */
static inline void dev_kfree_skb_irq(struct sk_buff *skb)
{
	dev_kfree_skb_irq_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
}

/* Consume from irq context: the _irq helper with SKB_CONSUMED. */
static inline void dev_consume_skb_irq(struct sk_buff *skb)
{
	dev_kfree_skb_irq_reason(skb, SKB_CONSUMED);
}

/* Drop from unknown context: the _any helper with SKB_DROP_REASON_NOT_SPECIFIED. */
static inline void dev_kfree_skb_any(struct sk_buff *skb)
{
	dev_kfree_skb_any_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
}

/* Consume from unknown context: the _any helper with SKB_CONSUMED. */
static inline void dev_consume_skb_any(struct sk_buff *skb)
{
	dev_kfree_skb_any_reason(skb, SKB_CONSUMED);
}

/* Generic XDP and receive-path entry points; implemented outside this header. */
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
			     struct bpf_prog *xdp_prog);
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb);
int netif_rx(struct sk_buff *skb);
int __netif_rx(struct sk_buff *skb);

int netif_receive_skb(struct sk_buff *skb);
int netif_receive_skb_core(struct sk_buff *skb);
void netif_receive_skb_list_internal(struct list_head *head);
void netif_receive_skb_list(struct list_head *head);
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
void napi_gro_flush(struct napi_struct *napi, bool flush_old);
struct sk_buff *napi_get_frags(struct napi_struct *napi);
void napi_get_frags_check(struct napi_struct *napi);
gro_result_t napi_gro_frags(struct napi_struct *napi);

/* Free napi->skb with kfree_skb() and reset the pointer to NULL. */
static inline void napi_free_frags(struct napi_struct *napi)
{
	kfree_skb(napi->skb);
	napi->skb = NULL;
}

bool netdev_is_rx_handler_busy(struct net_device *dev);
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data);
void netdev_rx_handler_unregister(struct net_device *dev);

bool dev_valid_name(const char *name);
static inline bool
is_socket_ioctl_cmd(unsigned int cmd)
{
	/* True when the ioctl type field of @cmd equals SOCK_IOC_TYPE. */
	return _IOC_TYPE(cmd) == SOCK_IOC_TYPE;
}
/* ioctl, flags, alias, namespace, MTU and MAC address API; implemented outside this header. */
int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg);
int put_user_ifreq(struct ifreq *ifr, void __user *arg);
int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
		void __user *data, bool *need_copyout);
int dev_ifconf(struct net *net, struct ifconf __user *ifc);
int generic_hwtstamp_get_lower(struct net_device *dev,
			       struct kernel_hwtstamp_config *kernel_cfg);
int generic_hwtstamp_set_lower(struct net_device *dev,
			       struct kernel_hwtstamp_config *kernel_cfg,
			       struct netlink_ext_ack *extack);
int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata);
unsigned int dev_get_flags(const struct net_device *);
int __dev_change_flags(struct net_device *dev, unsigned int flags,
		       struct netlink_ext_ack *extack);
int dev_change_flags(struct net_device *dev, unsigned int flags,
		     struct netlink_ext_ack *extack);
int dev_set_alias(struct net_device *, const char *, size_t);
int dev_get_alias(const struct net_device *, char *, size_t);
int __dev_change_net_namespace(struct net_device *dev, struct net *net,
			       const char *pat, int new_ifindex);
/* Calls __dev_change_net_namespace() with a new_ifindex argument of 0. */
static inline
int dev_change_net_namespace(struct net_device *dev, struct net *net,
			     const char *pat)
{
	return __dev_change_net_namespace(dev, net, pat, 0);
}
int __dev_set_mtu(struct net_device *, int);
int dev_set_mtu(struct net_device *, int);
int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
			      struct netlink_ext_ack *extack);
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
			struct netlink_ext_ack *extack);
int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
			     struct netlink_ext_ack *extack);
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
int dev_get_port_parent_id(struct net_device *dev,
			   struct netdev_phys_item_id *ppid, bool recurse);
bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b);

struct
sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u8 dev_xdp_prog_count(struct net_device *dev); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb); static __always_inline bool __is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb, const bool check_mtu) { const u32 vlan_hdr_len = 4; /* VLAN_HLEN */ unsigned int len; if (!(dev->flags & IFF_UP)) return false; if (!check_mtu) return true; len = dev->mtu + dev->hard_header_len + vlan_hdr_len; if (skb->len <= len) return true; /* if TSO is enabled, we don't care about the length as the packet * could be forwarded without being segmented before */ if (skb_is_gso(skb)) return true; return false; } void netdev_core_stats_inc(struct net_device *dev, u32 offset); #define DEV_CORE_STATS_INC(FIELD) \ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ { \ netdev_core_stats_inc(dev, \ offsetof(struct net_device_core_stats, FIELD)); \ } DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) DEV_CORE_STATS_INC(rx_nohandler) DEV_CORE_STATS_INC(rx_otherhost_dropped) #undef DEV_CORE_STATS_INC static __always_inline int ____dev_forward_skb(struct net_device *dev, struct sk_buff *skb, const bool check_mtu) { if (skb_orphan_frags(skb, GFP_ATOMIC) || unlikely(!__is_skb_forwardable(dev, skb, check_mtu))) { dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); return NET_RX_DROP; } skb_scrub_packet(skb, !net_eq(dev_net(dev), dev_net(skb->dev))); skb->priority 
= 0; return 0; } bool dev_nit_active(struct net_device *dev); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); static inline void __dev_put(struct net_device *dev) { if (dev) { #ifdef CONFIG_PCPU_DEV_REFCNT this_cpu_dec(*dev->pcpu_refcnt); #else refcount_dec(&dev->dev_refcnt); #endif } } static inline void __dev_hold(struct net_device *dev) { if (dev) { #ifdef CONFIG_PCPU_DEV_REFCNT this_cpu_inc(*dev->pcpu_refcnt); #else refcount_inc(&dev->dev_refcnt); #endif } } static inline void __netdev_tracker_alloc(struct net_device *dev, netdevice_tracker *tracker, gfp_t gfp) { #ifdef CONFIG_NET_DEV_REFCNT_TRACKER ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp); #endif } /* netdev_tracker_alloc() can upgrade a prior untracked reference * taken by dev_get_by_name()/dev_get_by_index() to a tracked one. */ static inline void netdev_tracker_alloc(struct net_device *dev, netdevice_tracker *tracker, gfp_t gfp) { #ifdef CONFIG_NET_DEV_REFCNT_TRACKER refcount_dec(&dev->refcnt_tracker.no_tracker); __netdev_tracker_alloc(dev, tracker, gfp); #endif } static inline void netdev_tracker_free(struct net_device *dev, netdevice_tracker *tracker) { #ifdef CONFIG_NET_DEV_REFCNT_TRACKER ref_tracker_free(&dev->refcnt_tracker, tracker); #endif } static inline void netdev_hold(struct net_device *dev, netdevice_tracker *tracker, gfp_t gfp) { if (dev) { __dev_hold(dev); __netdev_tracker_alloc(dev, tracker, gfp); } } static inline void netdev_put(struct net_device *dev, netdevice_tracker *tracker) { if (dev) { netdev_tracker_free(dev, tracker); __dev_put(dev); } } /** * dev_hold - get reference to device * @dev: network device * * Hold reference to device to keep it from being freed. * Try using netdev_hold() instead. */ static inline void dev_hold(struct net_device *dev) { netdev_hold(dev, NULL, GFP_ATOMIC); } /** * dev_put - release reference to device * @dev: network device * * Release reference to device to allow it to be freed. * Try using netdev_put() instead. 
*/
static inline void dev_put(struct net_device *dev)
{
	/* Untracked release: forwards to netdev_put() with a NULL tracker. */
	netdev_put(dev, NULL);
}

/*
 * Cleanup helper named "dev_put" for struct net_device pointers: calls
 * dev_put() only when the pointer is non-NULL.
 * NOTE(review): presumably consumed via __free(dev_put) from
 * <linux/cleanup.h> - confirm against callers.
 */
DEFINE_FREE(dev_put, struct net_device *, if (_T) dev_put(_T))

/*
 * netdev_ref_replace - move a tracked reference from @odev to @ndev
 * @odev: device currently referenced (may be NULL)
 * @ndev: device to reference instead (may be NULL)
 * @tracker: tracker entry released for @odev and re-allocated for @ndev
 * @gfp: allocation flags passed to the tracker allocation
 *
 * The reference on @ndev is taken before the one on @odev is dropped.
 */
static inline void netdev_ref_replace(struct net_device *odev,
				      struct net_device *ndev,
				      netdevice_tracker *tracker,
				      gfp_t gfp)
{
	if (odev)
		netdev_tracker_free(odev, tracker);

	/* Both helpers tolerate NULL, so no guards are needed here. */
	__dev_hold(ndev);
	__dev_put(odev);

	if (ndev)
		__netdev_tracker_alloc(ndev, tracker, gfp);
}

/* Carrier loss detection, dial on demand. The functions netif_carrier_on
 * and _off may be called from IRQ context, but it is the caller
 * who is responsible for serialization of these calls.
 *
 * The name carrier is inappropriate, these functions should really be
 * called netif_lowerlayer_*() because they represent the state of any
 * kind of lower layer not just hardware media.
 */
void linkwatch_fire_event(struct net_device *dev);

/**
 * linkwatch_sync_dev - sync linkwatch for the given device
 * @dev: network device to sync linkwatch for
 *
 * Sync linkwatch for the given device, removing it from the
 * pending work list (if queued).
 */
void linkwatch_sync_dev(struct net_device *dev);

/**
 *	netif_carrier_ok - test if carrier present
 *	@dev: network device
 *
 * Check if carrier is present on device
 */
static inline bool netif_carrier_ok(const struct net_device *dev)
{
	/* Carrier is present when the NOCARRIER state bit is clear. */
	return !test_bit(__LINK_STATE_NOCARRIER, &dev->state);
}

unsigned long dev_trans_start(struct net_device *dev);

void __netdev_watchdog_up(struct net_device *dev);

void netif_carrier_on(struct net_device *dev);
void netif_carrier_off(struct net_device *dev);
void netif_carrier_event(struct net_device *dev);

/**
 *	netif_dormant_on - mark device as dormant.
 *	@dev: network device
 *
 * Mark device as dormant (as per RFC2863).
 *
 * The dormant state indicates that the relevant interface is not
 * actually in a condition to pass packets (i.e., it is not 'up') but is
 * in a "pending" state, waiting for some external event.
For "on- * demand" interfaces, this new state identifies the situation where the * interface is waiting for events to place it in the up state. */ static inline void netif_dormant_on(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_DORMANT, &dev->state)) linkwatch_fire_event(dev); } /** * netif_dormant_off - set device as not dormant. * @dev: network device * * Device is not in dormant state. */ static inline void netif_dormant_off(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_DORMANT, &dev->state)) linkwatch_fire_event(dev); } /** * netif_dormant - test if device is dormant * @dev: network device * * Check if device is dormant. */ static inline bool netif_dormant(const struct net_device *dev) { return test_bit(__LINK_STATE_DORMANT, &dev->state); } /** * netif_testing_on - mark device as under test. * @dev: network device * * Mark device as under test (as per RFC2863). * * The testing state indicates that some test(s) must be performed on * the interface. After completion, of the test, the interface state * will change to up, dormant, or down, as appropriate. */ static inline void netif_testing_on(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_TESTING, &dev->state)) linkwatch_fire_event(dev); } /** * netif_testing_off - set device as not under test. * @dev: network device * * Device is not in testing state. 
*/ static inline void netif_testing_off(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_TESTING, &dev->state)) linkwatch_fire_event(dev); } /** * netif_testing - test if device is under test * @dev: network device * * Check if device is under test */ static inline bool netif_testing(const struct net_device *dev) { return test_bit(__LINK_STATE_TESTING, &dev->state); } /** * netif_oper_up - test if device is operational * @dev: network device * * Check if carrier is operational */ static inline bool netif_oper_up(const struct net_device *dev) { unsigned int operstate = READ_ONCE(dev->operstate); return operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN /* backward compat */; } /** * netif_device_present - is device available or removed * @dev: network device * * Check if device has not been removed from system. */ static inline bool netif_device_present(const struct net_device *dev) { return test_bit(__LINK_STATE_PRESENT, &dev->state); } void netif_device_detach(struct net_device *dev); void netif_device_attach(struct net_device *dev); /* * Network interface message level settings */ enum { NETIF_MSG_DRV_BIT, NETIF_MSG_PROBE_BIT, NETIF_MSG_LINK_BIT, NETIF_MSG_TIMER_BIT, NETIF_MSG_IFDOWN_BIT, NETIF_MSG_IFUP_BIT, NETIF_MSG_RX_ERR_BIT, NETIF_MSG_TX_ERR_BIT, NETIF_MSG_TX_QUEUED_BIT, NETIF_MSG_INTR_BIT, NETIF_MSG_TX_DONE_BIT, NETIF_MSG_RX_STATUS_BIT, NETIF_MSG_PKTDATA_BIT, NETIF_MSG_HW_BIT, NETIF_MSG_WOL_BIT, /* When you add a new bit above, update netif_msg_class_names array * in net/ethtool/common.c */ NETIF_MSG_CLASS_COUNT, }; /* Both ethtool_ops interface and internal driver implementation use u32 */ static_assert(NETIF_MSG_CLASS_COUNT <= 32); #define __NETIF_MSG_BIT(bit) ((u32)1 << (bit)) #define __NETIF_MSG(name) __NETIF_MSG_BIT(NETIF_MSG_ ## name ## _BIT) #define NETIF_MSG_DRV __NETIF_MSG(DRV) #define NETIF_MSG_PROBE __NETIF_MSG(PROBE) #define NETIF_MSG_LINK __NETIF_MSG(LINK) #define NETIF_MSG_TIMER __NETIF_MSG(TIMER) #define NETIF_MSG_IFDOWN 
__NETIF_MSG(IFDOWN)
#define NETIF_MSG_IFUP		__NETIF_MSG(IFUP)
#define NETIF_MSG_RX_ERR	__NETIF_MSG(RX_ERR)
#define NETIF_MSG_TX_ERR	__NETIF_MSG(TX_ERR)
#define NETIF_MSG_TX_QUEUED	__NETIF_MSG(TX_QUEUED)
#define NETIF_MSG_INTR		__NETIF_MSG(INTR)
#define NETIF_MSG_TX_DONE	__NETIF_MSG(TX_DONE)
#define NETIF_MSG_RX_STATUS	__NETIF_MSG(RX_STATUS)
#define NETIF_MSG_PKTDATA	__NETIF_MSG(PKTDATA)
#define NETIF_MSG_HW		__NETIF_MSG(HW)
#define NETIF_MSG_WOL		__NETIF_MSG(WOL)

/*
 * netif_msg_<class>(p): non-zero when the matching NETIF_MSG_<CLASS> bit
 * is set in (p)->msg_enable.
 */
#define netif_msg_drv(p)	((p)->msg_enable & NETIF_MSG_DRV)
#define netif_msg_probe(p)	((p)->msg_enable & NETIF_MSG_PROBE)
#define netif_msg_link(p)	((p)->msg_enable & NETIF_MSG_LINK)
#define netif_msg_timer(p)	((p)->msg_enable & NETIF_MSG_TIMER)
#define netif_msg_ifdown(p)	((p)->msg_enable & NETIF_MSG_IFDOWN)
#define netif_msg_ifup(p)	((p)->msg_enable & NETIF_MSG_IFUP)
#define netif_msg_rx_err(p)	((p)->msg_enable & NETIF_MSG_RX_ERR)
#define netif_msg_tx_err(p)	((p)->msg_enable & NETIF_MSG_TX_ERR)
#define netif_msg_tx_queued(p)	((p)->msg_enable & NETIF_MSG_TX_QUEUED)
#define netif_msg_intr(p)	((p)->msg_enable & NETIF_MSG_INTR)
#define netif_msg_tx_done(p)	((p)->msg_enable & NETIF_MSG_TX_DONE)
#define netif_msg_rx_status(p)	((p)->msg_enable & NETIF_MSG_RX_STATUS)
#define netif_msg_pktdata(p)	((p)->msg_enable & NETIF_MSG_PKTDATA)
#define netif_msg_hw(p)		((p)->msg_enable & NETIF_MSG_HW)
#define netif_msg_wol(p)	((p)->msg_enable & NETIF_MSG_WOL)

/*
 * netif_msg_init - turn a numeric debug level into a msg_enable bitmap
 * @debug_value: number of low bits to enable
 * @default_msg_enable_bits: bitmap returned when @debug_value is out of range
 *
 * Returns @default_msg_enable_bits when @debug_value is negative or not
 * smaller than the bit width of u32, 0 when it is 0, and otherwise a mask
 * with the low @debug_value bits set.
 */
static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
{
	/* use default */
	if (debug_value < 0 || debug_value >= (sizeof(u32) * 8))
		return default_msg_enable_bits;
	if (debug_value == 0)	/* no output */
		return 0;
	/* set low N bits */
	return (1U << debug_value) - 1;
}

/* Take the queue's xmit spinlock and record @cpu as its owner. */
static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
{
	spin_lock(&txq->_xmit_lock);
	/* Pairs with READ_ONCE() in __dev_queue_xmit() */
	WRITE_ONCE(txq->xmit_lock_owner, cpu);
}

static inline bool __netif_tx_acquire(struct netdev_queue *txq)
{
	__acquire(&txq->_xmit_lock);
return true; } static inline void __netif_tx_release(struct netdev_queue *txq) { __release(&txq->_xmit_lock); } static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); /* Pairs with READ_ONCE() in __dev_queue_xmit() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } static inline bool __netif_tx_trylock(struct netdev_queue *txq) { bool ok = spin_trylock(&txq->_xmit_lock); if (likely(ok)) { /* Pairs with READ_ONCE() in __dev_queue_xmit() */ WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } return ok; } static inline void __netif_tx_unlock(struct netdev_queue *txq) { /* Pairs with READ_ONCE() in __dev_queue_xmit() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { /* Pairs with READ_ONCE() in __dev_queue_xmit() */ WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); } /* * txq->trans_start can be read locklessly from dev_watchdog() */ static inline void txq_trans_update(struct netdev_queue *txq) { if (txq->xmit_lock_owner != -1) WRITE_ONCE(txq->trans_start, jiffies); } static inline void txq_trans_cond_update(struct netdev_queue *txq) { unsigned long now = jiffies; if (READ_ONCE(txq->trans_start) != now) WRITE_ONCE(txq->trans_start, now); } /* legacy drivers only, netdev_start_xmit() sets txq->trans_start */ static inline void netif_trans_update(struct net_device *dev) { struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); txq_trans_cond_update(txq); } /** * netif_tx_lock - grab network device transmit lock * @dev: network device * * Get network device transmit lock */ void netif_tx_lock(struct net_device *dev); static inline void netif_tx_lock_bh(struct net_device *dev) { local_bh_disable(); netif_tx_lock(dev); } void netif_tx_unlock(struct net_device *dev); static inline void netif_tx_unlock_bh(struct net_device *dev) { netif_tx_unlock(dev); local_bh_enable(); } #define HARD_TX_LOCK(dev, txq, cpu) { 
\ if ((dev->features & NETIF_F_LLTX) == 0) { \ __netif_tx_lock(txq, cpu); \ } else { \ __netif_tx_acquire(txq); \ } \ } #define HARD_TX_TRYLOCK(dev, txq) \ (((dev->features & NETIF_F_LLTX) == 0) ? \ __netif_tx_trylock(txq) : \ __netif_tx_acquire(txq)) #define HARD_TX_UNLOCK(dev, txq) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ __netif_tx_unlock(txq); \ } else { \ __netif_tx_release(txq); \ } \ } static inline void netif_tx_disable(struct net_device *dev) { unsigned int i; int cpu; local_bh_disable(); cpu = smp_processor_id(); spin_lock(&dev->tx_global_lock); for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); __netif_tx_lock(txq, cpu); netif_tx_stop_queue(txq); __netif_tx_unlock(txq); } spin_unlock(&dev->tx_global_lock); local_bh_enable(); } static inline void netif_addr_lock(struct net_device *dev) { unsigned char nest_level = 0; #ifdef CONFIG_LOCKDEP nest_level = dev->nested_level; #endif spin_lock_nested(&dev->addr_list_lock, nest_level); } static inline void netif_addr_lock_bh(struct net_device *dev) { unsigned char nest_level = 0; #ifdef CONFIG_LOCKDEP nest_level = dev->nested_level; #endif local_bh_disable(); spin_lock_nested(&dev->addr_list_lock, nest_level); } static inline void netif_addr_unlock(struct net_device *dev) { spin_unlock(&dev->addr_list_lock); } static inline void netif_addr_unlock_bh(struct net_device *dev) { spin_unlock_bh(&dev->addr_list_lock); } /* * dev_addrs walker. Should be used only for read access. Call with * rcu_read_lock held. 
*/ #define for_each_dev_addr(dev, ha) \ list_for_each_entry_rcu(ha, &dev->dev_addrs.list, list) /* These functions live elsewhere (drivers/net/net_init.c, but related) */ void ether_setup(struct net_device *dev); /* Allocate dummy net_device */ struct net_device *alloc_netdev_dummy(int sizeof_priv); /* Support for loadable net-drivers */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs); #define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \ alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1) #define alloc_netdev_mq(sizeof_priv, name, name_assign_type, setup, count) \ alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, count, \ count) int register_netdev(struct net_device *dev); void unregister_netdev(struct net_device *dev); int devm_register_netdev(struct device *dev, struct net_device *ndev); /* General hardware address lists handling functions */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *), int (*unsync)(struct net_device *, const unsigned char *)); int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *, int), int (*unsync)(struct net_device *, const unsigned char *, int)); void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *, int)); void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *)); void __hw_addr_init(struct 
netdev_hw_addr_list *list);

/* Functions used for device addresses handling */
void dev_addr_mod(struct net_device *dev, unsigned int offset,
		  const void *addr, size_t len);

/* Write @len bytes of @addr at offset 0 via dev_addr_mod(). */
static inline void
__dev_addr_set(struct net_device *dev, const void *addr, size_t len)
{
	dev_addr_mod(dev, 0, addr, len);
}

/* Like __dev_addr_set(), with the length taken from dev->addr_len. */
static inline void dev_addr_set(struct net_device *dev, const u8 *addr)
{
	__dev_addr_set(dev, addr, dev->addr_len);
}

int dev_addr_add(struct net_device *dev, const unsigned char *addr,
		 unsigned char addr_type);
int dev_addr_del(struct net_device *dev, const unsigned char *addr,
		 unsigned char addr_type);

/* Functions used for unicast addresses handling */
int dev_uc_add(struct net_device *dev, const unsigned char *addr);
int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_uc_del(struct net_device *dev, const unsigned char *addr);
int dev_uc_sync(struct net_device *to, struct net_device *from);
int dev_uc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_uc_unsync(struct net_device *to, struct net_device *from);
void dev_uc_flush(struct net_device *dev);
void dev_uc_init(struct net_device *dev);

/**
 *  __dev_uc_sync - Synchronize device's unicast list
 *  @dev:  device to sync
 *  @sync: function to call if address should be added
 *  @unsync: function to call if address should be removed
 *
 *  Add newly added addresses to the interface, and release
 *  addresses that have been deleted.
 *
 *  Return: the value returned by __hw_addr_sync_dev() for &dev->uc.
 */
static inline int __dev_uc_sync(struct net_device *dev,
				int (*sync)(struct net_device *,
					    const unsigned char *),
				int (*unsync)(struct net_device *,
					      const unsigned char *))
{
	return __hw_addr_sync_dev(&dev->uc, dev, sync, unsync);
}

/**
 *  __dev_uc_unsync - Remove synchronized addresses from device
 *  @dev:  device to sync
 *  @unsync: function to call if address should be removed
 *
 *  Remove all addresses that were added to the device by dev_uc_sync().
*/
static inline void __dev_uc_unsync(struct net_device *dev,
				   int (*unsync)(struct net_device *,
						 const unsigned char *))
{
	/* Operates on the device's unicast list (&dev->uc). */
	__hw_addr_unsync_dev(&dev->uc, dev, unsync);
}

/* Functions used for multicast addresses handling */
int dev_mc_add(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_mc_del(struct net_device *dev, const unsigned char *addr);
int dev_mc_del_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_sync(struct net_device *to, struct net_device *from);
int dev_mc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_mc_unsync(struct net_device *to, struct net_device *from);
void dev_mc_flush(struct net_device *dev);
void dev_mc_init(struct net_device *dev);

/**
 *  __dev_mc_sync - Synchronize device's multicast list
 *  @dev:  device to sync
 *  @sync: function to call if address should be added
 *  @unsync: function to call if address should be removed
 *
 *  Add newly added addresses to the interface, and release
 *  addresses that have been deleted.
 *
 *  Return: the value returned by __hw_addr_sync_dev() for &dev->mc.
 */
static inline int __dev_mc_sync(struct net_device *dev,
				int (*sync)(struct net_device *,
					    const unsigned char *),
				int (*unsync)(struct net_device *,
					      const unsigned char *))
{
	return __hw_addr_sync_dev(&dev->mc, dev, sync, unsync);
}

/**
 *  __dev_mc_unsync - Remove synchronized addresses from device
 *  @dev:  device to sync
 *  @unsync: function to call if address should be removed
 *
 *  Remove all addresses that were added to the device by dev_mc_sync().
*/
static inline void __dev_mc_unsync(struct net_device *dev,
				   int (*unsync)(struct net_device *,
						 const unsigned char *))
{
	/* Operates on the device's multicast list (&dev->mc). */
	__hw_addr_unsync_dev(&dev->mc, dev, unsync);
}

/* Functions used for secondary unicast and multicast support */
void dev_set_rx_mode(struct net_device *dev);
int dev_set_promiscuity(struct net_device *dev, int inc);
int dev_set_allmulti(struct net_device *dev, int inc);
void netdev_state_change(struct net_device *dev);
void __netdev_notify_peers(struct net_device *dev);
void netdev_notify_peers(struct net_device *dev);
void netdev_features_change(struct net_device *dev);
/* Load a device via the kmod */
void dev_load(struct net *net, const char *name);
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
					struct rtnl_link_stats64 *storage);
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
			     const struct net_device_stats *netdev_stats);
void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
			   const struct pcpu_sw_netstats __percpu *netstats);
void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s);

/* Bit positions for the NESTED_SYNC_* masks defined below. */
enum {
	NESTED_SYNC_IMM_BIT,
	NESTED_SYNC_TODO_BIT,
};

#define __NESTED_SYNC_BIT(bit)	((u32)1 << (bit))
#define __NESTED_SYNC(name)	__NESTED_SYNC_BIT(NESTED_SYNC_ ## name ## _BIT)

#define NESTED_SYNC_IMM		__NESTED_SYNC(IMM)
#define NESTED_SYNC_TODO	__NESTED_SYNC(TODO)

/*
 * Argument bundle handed to the netdev_walk_all_*_dev*() callbacks.
 * NOTE(review): flags presumably holds NESTED_SYNC_* bits - confirm.
 */
struct netdev_nested_priv {
	unsigned char flags;
	void *data;
};

bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
						     struct list_head **iter);

/* iterate through upper list, must be called under RCU read lock */
#define netdev_for_each_upper_dev_rcu(dev, updev, iter) \
	for (iter = &(dev)->adj_list.upper, \
	     updev = netdev_upper_get_next_dev_rcu(dev, &(iter)); \
	     updev; \
	     updev = netdev_upper_get_next_dev_rcu(dev, &(iter)))

int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
				  int (*fn)(struct net_device *upper_dev,
					    struct netdev_nested_priv *priv),
struct netdev_nested_priv *priv);

bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
				  struct net_device *upper_dev);

bool netdev_has_any_upper_dev(struct net_device *dev);

void *netdev_lower_get_next_private(struct net_device *dev,
				    struct list_head **iter);
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
					struct list_head **iter);

/*
 * Walk the private data of @dev's lower devices. @iter starts at
 * adj_list.lower.next and the loop ends when the getter returns NULL.
 */
#define netdev_for_each_lower_private(dev, priv, iter) \
	for (iter = (dev)->adj_list.lower.next, \
	     priv = netdev_lower_get_next_private(dev, &(iter)); \
	     priv; \
	     priv = netdev_lower_get_next_private(dev, &(iter)))

/*
 * RCU flavour of the walker above. @iter starts at the list head itself
 * (&adj_list.lower) rather than at its first entry.
 */
#define netdev_for_each_lower_private_rcu(dev, priv, iter) \
	for (iter = &(dev)->adj_list.lower, \
	     priv = netdev_lower_get_next_private_rcu(dev, &(iter)); \
	     priv; \
	     priv = netdev_lower_get_next_private_rcu(dev, &(iter)))

void *netdev_lower_get_next(struct net_device *dev,
				struct list_head **iter);

/* Walk @dev's lower devices; ends when netdev_lower_get_next() returns NULL. */
#define netdev_for_each_lower_dev(dev, ldev, iter) \
	for (iter = (dev)->adj_list.lower.next, \
	     ldev = netdev_lower_get_next(dev, &(iter)); \
	     ldev; \
	     ldev = netdev_lower_get_next(dev, &(iter)))

struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
					     struct list_head **iter);
int netdev_walk_all_lower_dev(struct net_device *dev,
			      int (*fn)(struct net_device *lower_dev,
					struct netdev_nested_priv *priv),
			      struct netdev_nested_priv *priv);
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
				  int (*fn)(struct net_device *lower_dev,
					    struct netdev_nested_priv *priv),
				  struct netdev_nested_priv *priv);

void *netdev_adjacent_get_private(struct list_head *adj_list);
void *netdev_lower_get_first_private_rcu(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev);
int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev,
			  struct netlink_ext_ack *extack);
int netdev_master_upper_dev_link(struct net_device *dev,
				 struct net_device *upper_dev,
				 void *upper_priv, void *upper_info,
struct netlink_ext_ack *extack);
void netdev_upper_dev_unlink(struct net_device *dev,
			     struct net_device *upper_dev);
int netdev_adjacent_change_prepare(struct net_device *old_dev,
				   struct net_device *new_dev,
				   struct net_device *dev,
				   struct netlink_ext_ack *extack);
void netdev_adjacent_change_commit(struct net_device *old_dev,
				   struct net_device *new_dev,
				   struct net_device *dev);
void netdev_adjacent_change_abort(struct net_device *old_dev,
				  struct net_device *new_dev,
				  struct net_device *dev);
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname);
void *netdev_lower_dev_get_private(struct net_device *dev,
				   struct net_device *lower_dev);
void netdev_lower_state_changed(struct net_device *lower_dev,
				void *lower_state_info);

/* RSS keys are 40 or 52 bytes long */
#define NETDEV_RSS_KEY_LEN 52
extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
void netdev_rss_key_fill(void *buffer, size_t len);

int skb_checksum_help(struct sk_buff *skb);
int skb_crc32c_csum_help(struct sk_buff *skb);
int skb_csum_hwoffload_help(struct sk_buff *skb,
			    const netdev_features_t features);

/* Pairs one ifslave record with one ifbond record. */
struct netdev_bonding_info {
	ifslave	slave;
	ifbond	master;
};

struct netdev_notifier_bonding_info {
	struct netdev_notifier_info info; /* must be first */
	struct netdev_bonding_info  bonding_info;
};

void netdev_bonding_info_change(struct net_device *dev,
				struct netdev_bonding_info *bonding_info);

/* ethtool_notify() is an empty stub unless CONFIG_ETHTOOL_NETLINK is enabled. */
#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK)
void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data);
#else
static inline void ethtool_notify(struct net_device *dev, unsigned int cmd,
				  const void *data)
{
}
#endif

__be16 skb_network_protocol(struct sk_buff *skb, int *depth);

/*
 * can_checksum_protocol - can @features checksum packets of @protocol?
 * FCoE is decided solely by NETIF_F_FCOE_CRC; NETIF_F_HW_CSUM accepts
 * every other protocol; the remaining cases are decided per protocol.
 */
static inline bool can_checksum_protocol(netdev_features_t features,
					 __be16 protocol)
{
	if (protocol == htons(ETH_P_FCOE))
		return !!(features & NETIF_F_FCOE_CRC);

	/* Assume this is an IP checksum (not SCTP CRC) */

	if (features & NETIF_F_HW_CSUM) {
		/* Can checksum everything */
		return true;
	}

	switch
(protocol) { case htons(ETH_P_IP): return !!(features & NETIF_F_IP_CSUM); case htons(ETH_P_IPV6): return !!(features & NETIF_F_IPV6_CSUM); default: return false; } } #ifdef CONFIG_BUG void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb); #else static inline void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { } #endif /* rx skb timestamps */ void net_enable_timestamp(void); void net_disable_timestamp(void); static inline ktime_t netdev_get_tstamp(struct net_device *dev, const struct skb_shared_hwtstamps *hwtstamps, bool cycles) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_get_tstamp) return ops->ndo_get_tstamp(dev, hwtstamps, cycles); return hwtstamps->hwtstamp; } #ifndef CONFIG_PREEMPT_RT static inline void netdev_xmit_set_more(bool more) { __this_cpu_write(softnet_data.xmit.more, more); } static inline bool netdev_xmit_more(void) { return __this_cpu_read(softnet_data.xmit.more); } #else static inline void netdev_xmit_set_more(bool more) { current->net_xmit.more = more; } static inline bool netdev_xmit_more(void) { return current->net_xmit.more; } #endif static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) { netdev_xmit_set_more(more); return ops->ndo_start_xmit(skb, dev); } static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { const struct net_device_ops *ops = dev->netdev_ops; netdev_tx_t rc; rc = __netdev_start_xmit(ops, skb, dev, more); if (rc == NETDEV_TX_OK) txq_trans_update(txq); return rc; } int netdev_class_create_file_ns(const struct class_attribute *class_attr, const void *ns); void netdev_class_remove_file_ns(const struct class_attribute *class_attr, const void *ns); extern const struct kobj_ns_type_operations net_ns_type_operations; const char *netdev_drivername(const struct net_device *dev); static inline netdev_features_t 
netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { if ((f1 ^ f2) & NETIF_F_HW_CSUM) { if (f1 & NETIF_F_HW_CSUM) f1 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); else f2 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } return f1 & f2; } static inline netdev_features_t netdev_get_wanted_features( struct net_device *dev) { return (dev->features & ~dev->hw_features) | dev->wanted_features; } netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask); /* Allow TSO being used on stacked device : * Performing the GSO segmentation before last device * is a performance improvement. */ static inline netdev_features_t netdev_add_tso_features(netdev_features_t features, netdev_features_t mask) { return netdev_increment_features(features, NETIF_F_ALL_TSO, mask); } int __netdev_update_features(struct net_device *dev); void netdev_update_features(struct net_device *dev); void netdev_change_features(struct net_device *dev); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); netdev_features_t passthru_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features); netdev_features_t netif_skb_features(struct sk_buff *skb); void skb_warn_bad_offload(const struct sk_buff *skb); static inline bool net_gso_ok(netdev_features_t features, int gso_type) { netdev_features_t feature = (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT; /* check flags correspondence */ BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT)); 
BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_IPXIP4  != (NETIF_F_GSO_IPXIP4 >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_IPXIP6  != (NETIF_F_GSO_IPXIP6 >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_SCTP    != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT));
	BUILD_BUG_ON(SKB_GSO_FRAGLIST != (NETIF_F_GSO_FRAGLIST >> NETIF_F_GSO_SHIFT));

	/* Every bit of the shifted gso_type must be present in @features. */
	return (features & feature) == feature;
}

/*
 * skb_gso_ok - @features covers the skb's gso_type and, when the skb has
 * a frag list, also includes NETIF_F_FRAGLIST.
 */
static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
{
	return net_gso_ok(features, skb_shinfo(skb)->gso_type) &&
	       (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST));
}

/*
 * netif_needs_gso - true for a GSO skb that either fails skb_gso_ok() for
 * @features or whose ip_summed is neither CHECKSUM_PARTIAL nor
 * CHECKSUM_UNNECESSARY.
 */
static inline bool netif_needs_gso(struct sk_buff *skb,
				   netdev_features_t features)
{
	return skb_is_gso(skb) && (!skb_gso_ok(skb, features) ||
		unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
			 (skb->ip_summed != CHECKSUM_UNNECESSARY)));
}

void netif_set_tso_max_size(struct net_device *dev, unsigned int size);
void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs);
void netif_inherit_tso_max(struct net_device *to,
			   const struct net_device *from);

/* Each predicate below tests a single bit of dev->priv_flags. */
static inline bool netif_is_macsec(const struct net_device *dev)
{
	return dev->priv_flags & IFF_MACSEC;
}

static inline bool netif_is_macvlan(const struct net_device *dev)
{
	return dev->priv_flags & IFF_MACVLAN;
}

static inline bool netif_is_macvlan_port(const struct net_device *dev)
{
	return dev->priv_flags & IFF_MACVLAN_PORT;
}

static inline bool
netif_is_bond_master(const struct net_device *dev)
{
	/* Needs both IFF_MASTER in dev->flags and IFF_BONDING in dev->priv_flags. */
	return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING;
}

/* Needs both IFF_SLAVE in dev->flags and IFF_BONDING in dev->priv_flags. */
static inline bool netif_is_bond_slave(const struct net_device *dev)
{
	return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING;
}

/* Tests the IFF_SUPP_NOFCS bit of dev->priv_flags. */
static inline bool netif_supports_nofcs(struct net_device *dev)
{
	return dev->priv_flags & IFF_SUPP_NOFCS;
}

/* Tests the IFF_L3MDEV_RX_HANDLER bit of dev->priv_flags. */
static inline bool netif_has_l3_rx_handler(const struct net_device *dev)
{
	return dev->priv_flags & IFF_L3MDEV_RX_HANDLER;
}

/* Tests the IFF_L3MDEV_MASTER bit of dev->priv_flags. */
static inline bool netif_is_l3_master(const struct net_device *dev)
{
	return dev->priv_flags & IFF_L3MDEV_MASTER;
}

/* Tests the IFF_L3MDEV_SLAVE bit of dev->priv_flags. */
static inline bool netif_is_l3_slave(const struct net_device *dev)
{
	return dev->priv_flags & IFF_L3MDEV_SLAVE;
}

/*
 * Returns dev->ifindex when CONFIG_NET_L3_MASTER_DEV is enabled and @dev is
 * an L3 slave (see netif_is_l3_slave()); returns 0 in every other case.
 */
static inline int dev_sdif(const struct net_device *dev)
{
#ifdef CONFIG_NET_L3_MASTER_DEV
	if (netif_is_l3_slave(dev))
		return dev->ifindex;
#endif
	return 0;
}

/* Tests the IFF_EBRIDGE bit of dev->priv_flags. */
static inline bool netif_is_bridge_master(const struct net_device *dev)
{
	return dev->priv_flags & IFF_EBRIDGE;
}

/* Tests the IFF_BRIDGE_PORT bit of dev->priv_flags. */
static inline bool netif_is_bridge_port(const struct net_device *dev)
{
	return dev->priv_flags & IFF_BRIDGE_PORT;
}

/* Tests the IFF_OPENVSWITCH bit of dev->priv_flags. */
static inline bool netif_is_ovs_master(const struct net_device *dev)
{
	return dev->priv_flags & IFF_OPENVSWITCH;
}

/* Tests the IFF_OVS_DATAPATH bit of dev->priv_flags. */
static inline bool netif_is_ovs_port(const struct net_device *dev)
{
	return dev->priv_flags & IFF_OVS_DATAPATH;
}

/* True for either a bridge master or an OVS master. */
static inline bool netif_is_any_bridge_master(const struct net_device *dev)
{
	return netif_is_bridge_master(dev) || netif_is_ovs_master(dev);
}

/* True for either a bridge port or an OVS port. */
static inline bool netif_is_any_bridge_port(const struct net_device *dev)
{
	return netif_is_bridge_port(dev) || netif_is_ovs_port(dev);
}

/* Tests the IFF_TEAM bit of dev->priv_flags. */
static inline bool netif_is_team_master(const struct net_device *dev)
{
	return dev->priv_flags & IFF_TEAM;
}

/* Tests the IFF_TEAM_PORT bit of dev->priv_flags. */
static inline bool netif_is_team_port(const struct net_device *dev)
{
	return dev->priv_flags & IFF_TEAM_PORT;
}

/* True for either a bond master or a team master. */
static inline bool netif_is_lag_master(const struct net_device *dev)
{
	return netif_is_bond_master(dev) || netif_is_team_master(dev);
}

static inline bool
netif_is_lag_port(const struct net_device *dev)
{
	/* True for either a bond slave or a team port. */
	return netif_is_bond_slave(dev) || netif_is_team_port(dev);
}

/* Tests the IFF_RXFH_CONFIGURED bit of dev->priv_flags. */
static inline bool netif_is_rxfh_configured(const struct net_device *dev)
{
	return dev->priv_flags & IFF_RXFH_CONFIGURED;
}

/* Tests the IFF_FAILOVER bit of dev->priv_flags. */
static inline bool netif_is_failover(const struct net_device *dev)
{
	return dev->priv_flags & IFF_FAILOVER;
}

/* Tests the IFF_FAILOVER_SLAVE bit of dev->priv_flags. */
static inline bool netif_is_failover_slave(const struct net_device *dev)
{
	return dev->priv_flags & IFF_FAILOVER_SLAVE;
}

/*
 * This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit().
 * Clears both IFF_XMIT_DST_RELEASE and IFF_XMIT_DST_RELEASE_PERM in
 * dev->priv_flags.
 */
static inline void netif_keep_dst(struct net_device *dev)
{
	dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM);
}

/*
 * Return true if dev can't cope with mtu frames that need vlan tag insertion.
 * Currently this is simply netif_is_macsec().
 */
static inline bool netif_reduces_vlan_mtu(struct net_device *dev)
{
	/* TODO: reserve and use an additional IFF bit, if we get more users */
	return netif_is_macsec(dev);
}

extern struct pernet_operations __net_initdata loopback_net_ops;

/* Logging, debugging and troubleshooting/diagnostic helpers.
*/

/* netdev_printk helpers, similar to dev_printk */

/*
 * Returns dev->name, or the fixed string "(unnamed net_device)" when the
 * name is empty or still contains a '%' character.
 */
static inline const char *netdev_name(const struct net_device *dev)
{
	if (!dev->name[0] || strchr(dev->name, '%'))
		return "(unnamed net_device)";
	return dev->name;
}

/*
 * Maps dev->reg_state (sampled once with READ_ONCE()) to a suffix string;
 * NETREG_REGISTERED maps to the empty string.  A value outside the listed
 * cases triggers WARN_ONCE() and yields " (unknown)".
 */
static inline const char *netdev_reg_state(const struct net_device *dev)
{
	u8 reg_state = READ_ONCE(dev->reg_state);

	switch (reg_state) {
	case NETREG_UNINITIALIZED: return " (uninitialized)";
	case NETREG_REGISTERED: return "";
	case NETREG_UNREGISTERING: return " (unregistering)";
	case NETREG_UNREGISTERED: return " (unregistered)";
	case NETREG_RELEASED: return " (released)";
	case NETREG_DUMMY: return " (dummy)";
	}

	WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, reg_state);
	return " (unknown)";
}

/* Expands to MODULE_ALIAS() with "netdev-" prepended to @device. */
#define MODULE_ALIAS_NETDEV(device) \
	MODULE_ALIAS("netdev-" device)

/*
 * netdev_WARN() acts like dev_printk(), but with the key difference
 * of using a WARN/WARN_ON to get the message out, including the
 * file/line information and a backtrace.
 *
 * Both variants prefix the message with netdev_name() and
 * netdev_reg_state() of @dev.
 */
#define netdev_WARN(dev, format, args...) \
	WARN(1, "netdevice: %s%s: " format, netdev_name(dev), \
	     netdev_reg_state(dev), ##args)

#define netdev_WARN_ONCE(dev, format, args...) \
	WARN_ONCE(1, "netdevice: %s%s: " format, netdev_name(dev), \
		  netdev_reg_state(dev), ##args)

/*
 * The list of packet types we will receive (as opposed to discard)
 * and the routines to invoke.
 *
 * Why 16. Because with 16 the only overlap we get on a hash of the
 * low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	0800	IP
 *	0001	802.3
 *	0002	AX.25
 *	0004	802.2
 *	8035	RARP
 *	0005	SNAP
 *	0805	X.25
 *	0806	ARP
 *	8137	IPX
 *	0009	Localtalk
 *	86DD	IPv6
 */
#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;

extern struct net_device *blackhole_netdev;

/* Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters.
*/
/*
 * Each macro applies the matching atomic_long_*() operation (inc, add, read)
 * to the __FIELD member of (DEV)->stats.
 */
#define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD)
#define DEV_STATS_ADD(DEV, FIELD, VAL) \
	atomic_long_add((VAL), &(DEV)->stats.__##FIELD)
#define DEV_STATS_READ(DEV, FIELD) atomic_long_read(&(DEV)->stats.__##FIELD)

/*
 * NOTE(review): the '|' after the include-guard comment below looks like
 * table-markup residue from text extraction rather than C -- confirm and drop.
 */
#endif /* _LINUX_NETDEVICE_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 
1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 
1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 | /* SPDX-License-Identifier: GPL-2.0 */
/*
 * NOTE(review): the run of numbers and the '|' in front of the SPDX tag look
 * like line-number gutter residue from text extraction, not C -- confirm.
 */
#ifndef _NET_NF_TABLES_H
#define _NET_NF_TABLES_H

#include <asm/unaligned.h>
#include <linux/list.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/nf_tables.h>
#include <linux/u64_stats_sync.h>
#include <linux/rhashtable.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netlink.h>
#include <net/flow_offload.h>
#include <net/netns/generic.h>

#define NFT_MAX_HOOKS (NF_INET_INGRESS + 1)

struct module;

#define NFT_JUMP_STACK_SIZE 16

/* Single-bit flag values; presumably stored in nft_pktinfo.flags -- confirm. */
enum {
	NFT_PKTINFO_L4PROTO = (1 << 0),
	NFT_PKTINFO_INNER = (1 << 1),
	NFT_PKTINFO_INNER_FULL = (1 << 2),
};

/*
 * Packet context: the skb, its netfilter hook state, and protocol/offset
 * fields.  The nft_*() accessors below read through @state (or @thoff).
 */
struct nft_pktinfo {
	struct sk_buff *skb;
	const struct nf_hook_state *state;
	u8 flags;
	u8 tprot;
	u16 fragoff;
	u16 thoff;
	u16 inneroff;
};

/* Returns pkt->state->sk. */
static inline struct sock *nft_sk(const struct nft_pktinfo *pkt)
{
	return pkt->state->sk;
}

/* Returns pkt->thoff. */
static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt)
{
	return pkt->thoff;
}

/* Returns pkt->state->net. */
static inline struct net *nft_net(const struct nft_pktinfo *pkt)
{
	return pkt->state->net;
}

/* Returns pkt->state->hook. */
static inline unsigned int nft_hook(const struct nft_pktinfo *pkt)
{
	return pkt->state->hook;
}

/* Returns pkt->state->pf. */
static inline u8 nft_pf(const struct nft_pktinfo *pkt)
{
	return pkt->state->pf;
}

/* Returns pkt->state->in. */
static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt)
{
	return pkt->state->in;
}

/* Returns pkt->state->out. */
static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt)
{
	return pkt->state->out;
}

static
inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
			    struct sk_buff *skb,
			    const struct nf_hook_state *state)
{
	/* Only the skb and state pointers are set here. */
	pkt->skb = skb;
	pkt->state = state;
}

/*
 * Zeroes the flags, tprot, thoff and fragoff fields of @pkt; the skb and
 * state pointers and inneroff are not touched.
 */
static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt)
{
	pkt->flags = 0;
	pkt->tprot = 0;
	pkt->thoff = 0;
	pkt->fragoff = 0;
}

/**
 * struct nft_verdict - nf_tables verdict
 *
 * @code: nf_tables/netfilter verdict code
 * @chain: destination chain for NFT_JUMP/NFT_GOTO
 */
struct nft_verdict {
	u32 code;
	struct nft_chain *chain;
};

/*
 * Four 32-bit data words overlaid with a verdict; the type is aligned to
 * the alignment of u64.
 */
struct nft_data {
	union {
		u32 data[4];
		struct nft_verdict verdict;
	};
} __attribute__((aligned(__alignof__(u64))));

#define NFT_REG32_NUM 20

/**
 * struct nft_regs - nf_tables register set
 *
 * @data: data registers
 * @verdict: verdict register
 *
 * The first four data registers alias to the verdict register.
 */
struct nft_regs {
	union {
		u32 data[NFT_REG32_NUM];
		struct nft_verdict verdict;
	};
};

/* Tracking state with one {selector, bitwise, num_reg} slot per 32-bit register. */
struct nft_regs_track {
	struct {
		const struct nft_expr *selector;
		const struct nft_expr *bitwise;
		u8 num_reg;
	} regs[NFT_REG32_NUM];

	const struct nft_expr *cur;
	const struct nft_expr *last;
};

/* Store/load an u8, u16 or u64 integer to/from the u32 data register.
 *
 * Note, when using concatenations, register allocation happens at 32-bit
 * level. So for store instruction, pad the rest part with zero to avoid
 * garbage values.
*/

/* Zeroes the whole 32-bit register, then writes @val through a u8 pointer to it. */
static inline void nft_reg_store8(u32 *dreg, u8 val)
{
	*dreg = 0;
	*(u8 *)dreg = val;
}

/* Reads the register through a u8 pointer. */
static inline u8 nft_reg_load8(const u32 *sreg)
{
	return *(u8 *)sreg;
}

/* Zeroes the whole 32-bit register, then writes @val through a u16 pointer to it. */
static inline void nft_reg_store16(u32 *dreg, u16 val)
{
	*dreg = 0;
	*(u16 *)dreg = val;
}

/* Same as nft_reg_store16(), taking a __be16 value (cast with __force). */
static inline void nft_reg_store_be16(u32 *dreg, __be16 val)
{
	nft_reg_store16(dreg, (__force __u16)val);
}

/* Reads the register through a u16 pointer. */
static inline u16 nft_reg_load16(const u32 *sreg)
{
	return *(u16 *)sreg;
}

/* Same as nft_reg_load16(), with the result typed as __be16. */
static inline __be16 nft_reg_load_be16(const u32 *sreg)
{
	return (__force __be16)nft_reg_load16(sreg);
}

/* Reads the full 32-bit register as a __be32. */
static inline __be32 nft_reg_load_be32(const u32 *sreg)
{
	return *(__force __be32 *)sreg;
}

/* Stores @val with put_unaligned(), so @dreg need not be 64-bit aligned. */
static inline void nft_reg_store64(u64 *dreg, u64 val)
{
	put_unaligned(val, dreg);
}

/* Loads a u64 with get_unaligned() starting at @sreg. */
static inline u64 nft_reg_load64(const u32 *sreg)
{
	return get_unaligned((u64 *)sreg);
}

/*
 * Copies @len bytes from @src to @dst.  When @len is not a multiple of
 * NFT_REG32_SIZE, the last (partially filled) 32-bit word of @dst is zeroed
 * first so its tail bytes do not keep stale contents.
 */
static inline void nft_data_copy(u32 *dst, const struct nft_data *src,
				 unsigned int len)
{
	if (len % NFT_REG32_SIZE)
		dst[len / NFT_REG32_SIZE] = 0;
	memcpy(dst, src, len);
}

/**
 * struct nft_ctx - nf_tables rule/set context
 *
 * @net: net namespace
 * @table: the table the chain is contained in
 * @chain: the chain the rule is contained in
 * @nla: netlink attributes
 * @portid: netlink portID of the original message
 * @seq: netlink sequence number
 * @flags: modifiers to new request
 * @family: protocol family
 * @level: depth of the chains
 * @report: notify via unicast netlink message
 */
struct nft_ctx {
	struct net *net;
	struct nft_table *table;
	struct nft_chain *chain;
	const struct nlattr * const *nla;
	u32 portid;
	u32 seq;
	u16 flags;
	u8 family;
	u8 level;
	bool report;
};

/* Single-bit flag values; presumably for nft_data_desc.flags -- confirm. */
enum nft_data_desc_flags {
	NFT_DATA_DESC_SETELEM = (1 << 0),
};

/* Descriptor passed to nft_data_init(): data type plus size, len and flags. */
struct nft_data_desc {
	enum nft_data_types type;
	unsigned int size;
	unsigned int len;
	unsigned int flags;
};

int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
		  struct nft_data_desc *desc, const struct nlattr *nla);
void nft_data_hold(const struct nft_data *data, enum nft_data_types type);
void nft_data_release(const struct nft_data
*data, enum nft_data_types type);
int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
		  enum nft_data_types type, unsigned int len);

/* NFT_REG_VERDICT maps to NFT_DATA_VERDICT; every other register to NFT_DATA_VALUE. */
static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg)
{
	return reg == NFT_REG_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE;
}

/*
 * NFT_DATA_VERDICT maps to NFT_REG_VERDICT; any other type maps to
 * NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE.
 */
static inline enum nft_registers nft_type_to_reg(enum nft_data_types type)
{
	return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE;
}

int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest);
int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg);

int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len);
int nft_parse_register_store(const struct nft_ctx *ctx,
			     const struct nlattr *attr, u8 *dreg,
			     const struct nft_data *data,
			     enum nft_data_types type, unsigned int len);

/**
 * struct nft_userdata - user defined data associated with an object
 *
 * @len: length of the data
 * @data: content
 *
 * The presence of user data is indicated in an object specific fashion,
 * so a length of zero can't occur and the value "len" indicates data
 * of length len + 1.
 */
struct nft_userdata {
	u8 len;
	unsigned char data[];
};

/* placeholder structure for opaque set element backend representation.
*/
struct nft_elem_priv { };

/**
 * struct nft_set_elem - generic representation of set elements
 *
 * @key: element key
 * @key_end: closing element key
 * @data: element data
 * @priv: element private data and extensions
 *
 * @key, @key_end and @data are each a union of a raw u32 buffer of
 * NFT_DATA_VALUE_MAXLEN bytes and a struct nft_data.
 */
struct nft_set_elem {
	union {
		u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
		struct nft_data val;
	} key;
	union {
		u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
		struct nft_data val;
	} key_end;
	union {
		u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
		struct nft_data val;
	} data;
	struct nft_elem_priv *priv;
};

/* Returns @priv as a non-const void pointer (drops the const qualifier). */
static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv)
{
	return (void *)priv;
}

/**
 * enum nft_iter_type - nftables set iterator type
 *
 * @NFT_ITER_READ: read-only iteration over set elements
 * @NFT_ITER_UPDATE: iteration under mutex to update set element state
 */
enum nft_iter_type {
	NFT_ITER_UNSPEC,
	NFT_ITER_READ,
	NFT_ITER_UPDATE,
};

struct nft_set;

/*
 * State handed to a set's walk operation.  @fn takes one element's private
 * data; presumably it is invoked once per element by the walker -- confirm
 * against the set backends.
 */
struct nft_set_iter {
	u8 genmask;
	enum nft_iter_type type:8;
	unsigned int count;
	unsigned int skip;
	int err;
	int (*fn)(const struct nft_ctx *ctx,
		  struct nft_set *set,
		  const struct nft_set_iter *iter,
		  struct nft_elem_priv *elem_priv);
};

/**
 * struct nft_set_desc - description of set elements
 *
 * @ktype: key type
 * @klen: key length
 * @dtype: data type
 * @dlen: data length
 * @objtype: object type
 * @size: number of set elements
 * @policy: set policy
 * @gc_int: garbage collector interval
 * @timeout: element timeout
 * @field_len: length of each field in concatenation, bytes
 * @field_count: number of concatenated fields in element
 * @expr: set must support for expressions
 */
struct nft_set_desc {
	u32 ktype;
	unsigned int klen;
	u32 dtype;
	unsigned int dlen;
	u32 objtype;
	unsigned int size;
	u32 policy;
	u32 gc_int;
	u64 timeout;
	u8 field_len[NFT_REG32_COUNT];
	u8 field_count;
	bool expr;
};

/**
 * enum nft_set_class - performance class
 *
 * @NFT_SET_CLASS_O_1: constant, O(1)
 * @NFT_SET_CLASS_O_LOG_N: logarithmic, O(log N)
 * @NFT_SET_CLASS_O_N: linear, O(N)
 */
enum nft_set_class {
	NFT_SET_CLASS_O_1,
NFT_SET_CLASS_O_LOG_N,
	NFT_SET_CLASS_O_N,
};

/**
 * struct nft_set_estimate - estimation of memory and performance
 *			      characteristics
 *
 * @size: required memory
 * @lookup: lookup performance class
 * @space: memory class
 */
struct nft_set_estimate {
	u64 size;
	enum nft_set_class lookup;
	enum nft_set_class space;
};

#define NFT_EXPR_MAXATTR 16
/*
 * sizeof(struct nft_expr) plus @size rounded up with ALIGN() to the
 * alignment of struct nft_expr.
 */
#define NFT_EXPR_SIZE(size) (sizeof(struct nft_expr) + \
			     ALIGN(size, __alignof__(struct nft_expr)))

/**
 * struct nft_expr - nf_tables expression
 *
 * @ops: expression ops
 * @data: expression private data
 */
struct nft_expr {
	const struct nft_expr_ops *ops;
	unsigned char data[]
		__attribute__((aligned(__alignof__(u64))));
};

/* Returns the expression's trailing private data area (expr->data). */
static inline void *nft_expr_priv(const struct nft_expr *expr)
{
	return (void *)expr->data;
}

struct nft_expr_info;

int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
			 struct nft_expr_info *info);
int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp);
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr);
int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
		  const struct nft_expr *expr, bool reset);
bool nft_expr_reduce_bitwise(struct nft_regs_track *track,
			     const struct nft_expr *expr);

struct nft_set_ext;

/**
 * struct nft_set_ops - nf_tables set operations
 *
 * @lookup: look up an element within the set
 * @update: update an element if exists, add it if doesn't exist
 * @delete: delete an element
 * @insert: insert new element into set
 * @activate: activate new element in the next generation
 * @deactivate: lookup for element and deactivate it in the next generation
 * @flush: deactivate element in the next generation
 * @remove: remove element from set
 * @walk: iterate over all set elements
 * @get: get set elements
 * @commit: commit set elements
 * @abort: abort set elements
 * @privsize: function to return size of set private data
 * @estimate: estimate the required memory size and the lookup complexity class
 * @init: initialize private data of new set
instance
 * @destroy: destroy private data of set instance
 * @gc_init: initialize garbage collection
 * @elemsize: element private size
 *
 * Operations lookup, update and delete have simpler interfaces, are faster
 * and currently only used in the packet path. All the rest are slower,
 * control plane functions.
 */
struct nft_set_ops {
	/* Packet path operations (see the note in the comment above). */
	bool (*lookup)(const struct net *net,
		       const struct nft_set *set,
		       const u32 *key,
		       const struct nft_set_ext **ext);
	bool (*update)(struct nft_set *set,
		       const u32 *key,
		       struct nft_elem_priv *
			(*new)(struct nft_set *,
			       const struct nft_expr *,
			       struct nft_regs *),
		       const struct nft_expr *expr,
		       struct nft_regs *regs,
		       const struct nft_set_ext **ext);
	bool (*delete)(const struct nft_set *set,
		       const u32 *key);

	/* Control plane operations. */
	int (*insert)(const struct net *net,
		      const struct nft_set *set,
		      const struct nft_set_elem *elem,
		      struct nft_elem_priv **priv);
	void (*activate)(const struct net *net,
			 const struct nft_set *set,
			 struct nft_elem_priv *elem_priv);
	struct nft_elem_priv * (*deactivate)(const struct net *net,
					      const struct nft_set *set,
					      const struct nft_set_elem *elem);
	void (*flush)(const struct net *net,
		      const struct nft_set *set,
		      struct nft_elem_priv *priv);
	void (*remove)(const struct net *net,
		       const struct nft_set *set,
		       struct nft_elem_priv *elem_priv);
	void (*walk)(const struct nft_ctx *ctx,
		     struct nft_set *set,
		     struct nft_set_iter *iter);
	struct nft_elem_priv * (*get)(const struct net *net,
				       const struct nft_set *set,
				       const struct nft_set_elem *elem,
				       unsigned int flags);
	void (*commit)(struct nft_set *set);
	void (*abort)(const struct nft_set *set);
	u64 (*privsize)(const struct nlattr * const nla[],
			const struct nft_set_desc *desc);
	bool (*estimate)(const struct nft_set_desc *desc,
			 u32 features,
			 struct nft_set_estimate *est);
	int (*init)(const struct nft_set *set,
		    const struct nft_set_desc *desc,
		    const struct nlattr * const nla[]);
	void (*destroy)(const struct nft_ctx *ctx,
			const struct nft_set *set);
	void (*gc_init)(const struct nft_set *set);

	unsigned int
elemsize;
};

/**
 * struct nft_set_type - nf_tables set type
 *
 * @ops: set ops for this type
 * @features: features supported by the implementation
 */
struct nft_set_type {
	const struct nft_set_ops ops;
	u32 features;
};
/* Recovers the enclosing struct nft_set_type from a pointer to its @ops member. */
#define to_set_type(o) container_of(o, struct nft_set_type, ops)

/* Container for expressions stored in @data; @size bounds the iteration below. */
struct nft_set_elem_expr {
	u8 size;
	unsigned char data[]
		__attribute__((aligned(__alignof__(struct nft_expr))));
};

/* Pointer to the struct nft_expr located at byte @__offset inside ->data. */
#define nft_setelem_expr_at(__elem_expr, __offset) \
	((struct nft_expr *)&__elem_expr->data[__offset])

/*
 * Iterates over the expressions in @__elem_expr->data: starts at offset 0
 * and advances both @__size and @__expr by the current expression's
 * ops->size until @__size reaches @__elem_expr->size.
 */
#define nft_setelem_expr_foreach(__expr, __elem_expr, __size) \
	for (__expr = nft_setelem_expr_at(__elem_expr, 0), __size = 0; \
	     __size < (__elem_expr)->size; \
	     __size += (__expr)->ops->size, __expr = ((void *)(__expr)) + (__expr)->ops->size)

#define NFT_SET_EXPR_MAX 2

/**
 * struct nft_set - nf_tables set instance
 *
 * @list: table set list node
 * @bindings: list of set bindings
 * @refs: internal refcounting for async set destruction
 * @table: table this set belongs to
 * @net: netnamespace this set belongs to
 * @name: name of the set
 * @handle: unique handle of the set
 * @ktype: key type (numeric type defined by userspace, not used in the kernel)
 * @dtype: data type (verdict or numeric type defined by userspace)
 * @objtype: object type (see NFT_OBJECT_* definitions)
 * @size: maximum set size
 * @field_len: length of each field in concatenation, bytes
 * @field_count: number of concatenated fields in element
 * @use: number of rules references to this set
 * @nelems: number of elements
 * @ndeact: number of deactivated elements queued for removal
 * @timeout: default timeout value in jiffies
 * @gc_int: garbage collection interval in msecs
 * @policy: set parameterization (see enum nft_set_policies)
 * @udlen: user data length
 * @udata: user data
 * @pending_update: list of pending update set element
 * @ops: set ops
 * @flags: set flags
 * @dead: set will be freed, never cleared
 * @genmask: generation mask
 * @klen: key length
 * @dlen: data length
 * @num_exprs: numbers of exprs
 * @exprs:
stateful expression
 * @catchall_list: list of catch-all set element
 * @data: private set data
 */
struct nft_set {
	struct list_head list;
	struct list_head bindings;
	refcount_t refs;
	struct nft_table *table;
	possible_net_t net;
	char *name;
	u64 handle;
	u32 ktype;
	u32 dtype;
	u32 objtype;
	u32 size;
	u8 field_len[NFT_REG32_COUNT];
	u8 field_count;
	u32 use;
	atomic_t nelems;
	u32 ndeact;
	u64 timeout;
	u32 gc_int;
	u16 policy;
	u16 udlen;
	unsigned char *udata;
	struct list_head pending_update;
	/* runtime data below here */
	const struct nft_set_ops *ops ____cacheline_aligned;
	u16 flags:13,
	    dead:1,
	    genmask:2;
	u8 klen;
	u8 dlen;
	u8 num_exprs;
	struct nft_expr *exprs[NFT_SET_EXPR_MAX];
	struct list_head catchall_list;
	unsigned char data[]
		__attribute__((aligned(__alignof__(u64))));
};

/* Tests the NFT_SET_ANONYMOUS bit of set->flags. */
static inline bool nft_set_is_anonymous(const struct nft_set *set)
{
	return set->flags & NFT_SET_ANONYMOUS;
}

/* Returns the set's trailing private data area (set->data). */
static inline void *nft_set_priv(const struct nft_set *set)
{
	return (void *)set->data;
}

/* NFT_DATA_VERDICT when set->dtype is NFT_DATA_VERDICT, otherwise NFT_DATA_VALUE. */
static inline enum nft_data_types nft_set_datatype(const struct nft_set *set)
{
	return set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE;
}

/* True when the set's refcount (s->refs) is anything other than 1. */
static inline bool nft_set_gc_is_pending(const struct nft_set *s)
{
	return refcount_read(&s->refs) != 1;
}

/*
 * Inverse of nft_set_priv(): steps back from the private data pointer by
 * offsetof(struct nft_set, data) to reach the owning struct nft_set.
 */
static inline struct nft_set *nft_set_container_of(const void *priv)
{
	return (void *)priv - offsetof(struct nft_set, data);
}

struct nft_set *nft_set_lookup_global(const struct net *net,
				      const struct nft_table *table,
				      const struct nlattr *nla_set_name,
				      const struct nlattr *nla_set_id,
				      u8 genmask);

struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
					    const struct nft_set *set);

/*
 * Returns set->gc_int (read once with READ_ONCE()) converted from msecs to
 * jiffies, or HZ when gc_int is zero.
 */
static inline unsigned long nft_set_gc_interval(const struct nft_set *set)
{
	u32 gc_int = READ_ONCE(set->gc_int);

	return gc_int ?
msecs_to_jiffies(gc_int) : HZ;
}

/**
 * struct nft_set_binding - nf_tables set binding
 *
 * @list: set bindings list node
 * @chain: chain containing the rule bound to the set
 * @flags: set action flags
 *
 * A set binding contains all information necessary for validation
 * of new elements added to a bound set.
 */
struct nft_set_binding {
	struct list_head list;
	const struct nft_chain *chain;
	u32 flags;
};

enum nft_trans_phase;
void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set);
void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
			      struct nft_set_binding *binding,
			      enum nft_trans_phase phase);
int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
		       struct nft_set_binding *binding);
void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set);

/**
 * enum nft_set_extensions - set extension type IDs
 *
 * @NFT_SET_EXT_KEY: element key
 * @NFT_SET_EXT_KEY_END: upper bound element key, for ranges
 * @NFT_SET_EXT_DATA: mapping data
 * @NFT_SET_EXT_FLAGS: element flags
 * @NFT_SET_EXT_TIMEOUT: element timeout
 * @NFT_SET_EXT_EXPIRATION: element expiration time
 * @NFT_SET_EXT_USERDATA: user data associated with the element
 * @NFT_SET_EXT_EXPRESSIONS: expressions associated with the element
 * @NFT_SET_EXT_OBJREF: stateful object reference associated with element
 * @NFT_SET_EXT_NUM: number of extension types
 */
enum nft_set_extensions {
	NFT_SET_EXT_KEY,
	NFT_SET_EXT_KEY_END,
	NFT_SET_EXT_DATA,
	NFT_SET_EXT_FLAGS,
	NFT_SET_EXT_TIMEOUT,
	NFT_SET_EXT_EXPIRATION,
	NFT_SET_EXT_USERDATA,
	NFT_SET_EXT_EXPRESSIONS,
	NFT_SET_EXT_OBJREF,
	NFT_SET_EXT_NUM
};

/**
 * struct nft_set_ext_type - set extension type
 *
 * @len: fixed part length of the extension
 * @align: alignment requirements of the extension
 */
struct nft_set_ext_type {
	u8 len;
	u8 align;
};

/* Table of extension types; indexed by extension id in the helpers that use it. */
extern const struct nft_set_ext_type nft_set_ext_types[];

/**
 * struct nft_set_ext_tmpl - set extension template
 *
 * @len: length of extension area
 * @offset: offsets of individual
extension types
 * @ext_len: length of the expected extension (used to sanity check)
 */
struct nft_set_ext_tmpl {
	u16 len;
	u8 offset[NFT_SET_EXT_NUM];
	u8 ext_len[NFT_SET_EXT_NUM];
};

/**
 * struct nft_set_ext - set extensions
 *
 * @genmask: generation mask
 * @offset: offsets of individual extension types
 * @data: beginning of extension data
 */
struct nft_set_ext {
	u8 genmask;
	u8 offset[NFT_SET_EXT_NUM];
	char data[];
};

/* Zeroes the template and sets its length to sizeof(struct nft_set_ext). */
static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl)
{
	memset(tmpl, 0, sizeof(*tmpl));
	tmpl->len = sizeof(struct nft_set_ext);
}

/*
 * Appends extension @id to the template: aligns tmpl->len to the type's
 * alignment, records that value as the extension's offset, and grows
 * tmpl->len by the type's fixed length plus @len.
 *
 * Returns 0 on success, or -EINVAL when the aligned offset exceeds U8_MAX
 * (offsets are stored in u8 fields).
 */
static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id,
					 unsigned int len)
{
	tmpl->len = ALIGN(tmpl->len, nft_set_ext_types[id].align);
	if (tmpl->len > U8_MAX)
		return -EINVAL;

	tmpl->offset[id] = tmpl->len;
	tmpl->ext_len[id] = nft_set_ext_types[id].len + len;
	tmpl->len += tmpl->ext_len[id];

	return 0;
}

/* nft_set_ext_add_length() with no variable-length part (len 0). */
static inline int nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id)
{
	return nft_set_ext_add_length(tmpl, id, 0);
}

/* Copies the template's offset table into @ext. */
static inline void nft_set_ext_init(struct nft_set_ext *ext,
				    const struct nft_set_ext_tmpl *tmpl)
{
	memcpy(ext->offset, tmpl->offset, sizeof(ext->offset));
}

/* An extension is present when its recorded offset is non-zero. */
static inline bool __nft_set_ext_exists(const struct nft_set_ext *ext, u8 id)
{
	return !!ext->offset[id];
}

/* Same as __nft_set_ext_exists(), but returns false for a NULL @ext. */
static inline bool nft_set_ext_exists(const struct nft_set_ext *ext, u8 id)
{
	return ext && __nft_set_ext_exists(ext, id);
}

/*
 * Returns the address of extension @id: @ext plus the recorded offset.
 * Does not check that the extension exists.
 */
static inline void *nft_set_ext(const struct nft_set_ext *ext, u8 id)
{
	return (void *)ext + ext->offset[id];
}

/* Typed accessors: each returns nft_set_ext() for one fixed extension id. */
static inline struct nft_data *nft_set_ext_key(const struct nft_set_ext *ext)
{
	return nft_set_ext(ext, NFT_SET_EXT_KEY);
}

static inline struct nft_data *nft_set_ext_key_end(const struct nft_set_ext *ext)
{
	return nft_set_ext(ext, NFT_SET_EXT_KEY_END);
}

static inline struct nft_data *nft_set_ext_data(const struct nft_set_ext *ext)
{
	return nft_set_ext(ext, NFT_SET_EXT_DATA);
}

static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext)
{
	return nft_set_ext(ext,
NFT_SET_EXT_FLAGS);
}

/*
 * Typed accessors: each returns the location of one extension inside @ext
 * via nft_set_ext(); none of them checks that the extension is present.
 */
static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext)
{
	return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT);
}

static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext)
{
	return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION);
}

static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext)
{
	return nft_set_ext(ext, NFT_SET_EXT_USERDATA);
}

static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ext *ext)
{
	return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS);
}

/*
 * True when the element carries an expiration extension and @tstamp is at
 * or after the stored expiration time.
 */
static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext,
					  u64 tstamp)
{
	return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) &&
	       time_after_eq64(tstamp, *nft_set_ext_expiration(ext));
}

/* Expiry check against the current 64-bit jiffies value. */
static inline bool nft_set_elem_expired(const struct nft_set_ext *ext)
{
	return __nft_set_elem_expired(ext, get_jiffies_64());
}

/* The extension area is located set->ops->elemsize bytes past @elem_priv. */
static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set,
						   const struct nft_elem_priv *elem_priv)
{
	return (void *)elem_priv + set->ops->elemsize;
}

static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext)
{
	return nft_set_ext(ext, NFT_SET_EXT_OBJREF);
}

struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
					 const struct nft_set *set,
					 const struct nlattr *attr);

struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set,
					const struct nft_set_ext_tmpl *tmpl,
					const u32 *key, const u32 *key_end,
					const u32 *data,
					u64 timeout, u64 expiration, gfp_t gfp);
int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
			    struct nft_expr *expr_array[]);
void nft_set_elem_destroy(const struct nft_set *set,
			  const struct nft_elem_priv *elem_priv,
			  bool destroy_expr);
void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
				const struct nft_set *set,
				const struct nft_elem_priv *elem_priv);

struct nft_expr_ops;
/**
 * struct nft_expr_type - nf_tables expression type
 *
 * @select_ops: function to select nft_expr_ops
 * @release_ops: release
nft_expr_ops
 * @ops: default ops, used when no select_ops functions is present
 * @inner_ops: inner ops, used for inner packet operation
 * @list: used internally
 * @name: Identifier
 * @owner: module reference
 * @policy: netlink attribute policy
 * @maxattr: highest netlink attribute number
 * @family: address family for AF-specific types
 * @flags: expression type flags
 */
struct nft_expr_type {
	const struct nft_expr_ops *(*select_ops)(const struct nft_ctx *,
						 const struct nlattr * const tb[]);
	void (*release_ops)(const struct nft_expr_ops *ops);
	const struct nft_expr_ops *ops;
	const struct nft_expr_ops *inner_ops;
	struct list_head list;
	const char *name;
	struct module *owner;
	const struct nla_policy *policy;
	unsigned int maxattr;
	u8 family;
	u8 flags;
};

/* Bit values for nft_expr_type.flags. */
#define NFT_EXPR_STATEFUL 0x1
#define NFT_EXPR_GC 0x2

/* Transaction phase passed to the deactivate callbacks and helpers. */
enum nft_trans_phase {
	NFT_TRANS_PREPARE,
	NFT_TRANS_PREPARE_ERROR,
	NFT_TRANS_ABORT,
	NFT_TRANS_COMMIT,
	NFT_TRANS_RELEASE
};

struct nft_flow_rule;
struct nft_offload_ctx;

/**
 * struct nft_expr_ops - nf_tables expression operations
 *
 * @eval: Expression evaluation function
 * @clone: Expression clone function
 * @size: full expression size, including private data size
 * @init: initialization function
 * @activate: activate expression in the next generation
 * @deactivate: deactivate expression in next generation
 * @destroy: destruction function, called after synchronize_rcu
 * @destroy_clone: destruction clone function
 * @dump: function to dump parameters
 * @validate: validate expression, called during loop detection
 * @reduce: reduce expression
 * @gc: garbage collection expression
 * @offload: hardware offload expression
 * @offload_action: function to report true/false to allocate one slot or not in the flow
 *		    offload array
 * @offload_stats: function to synchronize hardware stats via updating the counter expression
 * @type: expression type
 * @data: extra data to attach to this expression operation
 */
struct nft_expr_ops {
	void (*eval)(const struct nft_expr *expr, struct
nft_regs *regs, const struct nft_pktinfo *pkt);
	int (*clone)(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp);
	unsigned int size;
	int (*init)(const struct nft_ctx *ctx, const struct nft_expr *expr,
		    const struct nlattr * const tb[]);
	void (*activate)(const struct nft_ctx *ctx, const struct nft_expr *expr);
	void (*deactivate)(const struct nft_ctx *ctx, const struct nft_expr *expr,
			   enum nft_trans_phase phase);
	void (*destroy)(const struct nft_ctx *ctx, const struct nft_expr *expr);
	void (*destroy_clone)(const struct nft_ctx *ctx, const struct nft_expr *expr);
	int (*dump)(struct sk_buff *skb, const struct nft_expr *expr, bool reset);
	int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr,
			const struct nft_data **data);
	bool (*reduce)(struct nft_regs_track *track, const struct nft_expr *expr);
	bool (*gc)(struct net *net, const struct nft_expr *expr);
	int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow,
		       const struct nft_expr *expr);
	bool (*offload_action)(const struct nft_expr *expr);
	void (*offload_stats)(struct nft_expr *expr, const struct flow_stats *stats);
	const struct nft_expr_type *type;
	void *data;
};

/**
 * struct nft_rule - nf_tables rule
 *
 * @list: used internally
 * @handle: rule handle
 * @genmask: generation mask
 * @dlen: length of expression data
 * @udata: user data is appended to the rule
 * @data: expression data
 */
struct nft_rule {
	struct list_head list;
	u64 handle:42, genmask:2, dlen:12, udata:1;
	unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr))));
};

/* First expression: the start of the rule's data area. */
static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule)
{
	return (struct nft_expr *)&rule->data[0];
}

/* Step to the following expression by skipping expr->ops->size bytes. */
static inline struct nft_expr *nft_expr_next(const struct nft_expr *expr)
{
	return ((void *)expr) + expr->ops->size;
}

/* End marker: the address dlen bytes into the rule's data area. */
static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule)
{
	return (struct nft_expr *)&rule->data[rule->dlen];
}

static inline bool nft_expr_more(const struct nft_rule *rule, const
struct nft_expr *expr)
{
	/* More to walk while @expr is not the end marker and has ops set. */
	return expr != nft_expr_last(rule) && expr->ops;
}

/* Address dlen bytes into the rule's data area, viewed as user data. */
static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule)
{
	return (void *)&rule->data[rule->dlen];
}

void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule);
void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule,
			      enum nft_trans_phase phase);
void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule);

/*
 * Evaluate every expression attached to the element, if the expressions
 * extension is present. Evaluation stops as soon as an expression leaves
 * NFT_BREAK in the verdict register.
 */
static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext,
					    struct nft_regs *regs,
					    const struct nft_pktinfo *pkt)
{
	struct nft_set_elem_expr *elem_expr;
	struct nft_expr *expr;
	u32 size;

	if (__nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) {
		elem_expr = nft_set_ext_expr(ext);
		nft_setelem_expr_foreach(expr, elem_expr, size) {
			expr->ops->eval(expr, regs, pkt);
			if (regs->verdict.code == NFT_BREAK)
				return;
		}
	}
}

/*
 * The last pointer isn't really necessary, but the compiler isn't able to
 * determine that the result of nft_expr_last() is always the same since it
 * can't assume that the dlen value wasn't changed within calls in the loop.
*/
#define nft_rule_for_each_expr(expr, last, rule) \
	for ((expr) = nft_expr_first(rule), (last) = nft_expr_last(rule); \
	     (expr) != (last); \
	     (expr) = nft_expr_next(expr))

#define NFT_CHAIN_POLICY_UNSET U8_MAX

struct nft_rule_dp {
	u64 is_last:1, dlen:12, handle:42; /* for tracing */
	unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr))));
};

struct nft_rule_dp_last {
	struct nft_rule_dp end; /* end of nft_rule_blob marker */
	struct rcu_head h; /* call_rcu head */
	struct nft_rule_blob *blob; /* ptr to free via call_rcu */
	const struct nft_chain *chain; /* for nftables tracing */
};

/* The next rule starts right after this rule's header and dlen data bytes. */
static inline const struct nft_rule_dp *nft_rule_next(const struct nft_rule_dp *rule)
{
	return (void *)rule + sizeof(*rule) + rule->dlen;
}

struct nft_rule_blob {
	unsigned long size;
	unsigned char data[] __attribute__((aligned(__alignof__(struct nft_rule_dp))));
};

/**
 * struct nft_chain - nf_tables chain
 *
 * @blob_gen_0: rule blob pointer to the current generation
 * @blob_gen_1: rule blob pointer to the future generation
 * @rules: list of rules in the chain
 * @list: used internally
 * @rhlhead: used internally
 * @table: table that this chain belongs to
 * @handle: chain handle
 * @use: number of jump references to this chain
 * @flags: bitmask of enum NFTA_CHAIN_FLAGS
 * @bound: bind or not
 * @genmask: generation mask
 * @name: name of the chain
 * @udlen: user data length
 * @udata: user data in the chain
 * @blob_next: rule blob pointer to the next in the chain
 */
struct nft_chain {
	struct nft_rule_blob __rcu *blob_gen_0;
	struct nft_rule_blob __rcu *blob_gen_1;
	struct list_head rules;
	struct list_head list;
	struct rhlist_head rhlhead;
	struct nft_table *table;
	u64 handle;
	u32 use;
	u8 flags:5, bound:1, genmask:2;
	char *name;
	u16 udlen;
	u8 *udata;

	/* Only used during control plane commit phase: */
	struct nft_rule_blob *blob_next;
};

int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain);
int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set
*set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv);
int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set);
int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);

/* Numeric chain type identifiers; NFT_CHAIN_T_MAX is the count sentinel. */
enum nft_chain_types {
	NFT_CHAIN_T_DEFAULT = 0,
	NFT_CHAIN_T_ROUTE,
	NFT_CHAIN_T_NAT,
	NFT_CHAIN_T_MAX
};

/**
 * struct nft_chain_type - nf_tables chain type info
 *
 * @name: name of the type
 * @type: numeric identifier
 * @family: address family
 * @owner: module owner
 * @hook_mask: mask of valid hooks
 * @hooks: array of hook functions
 * @ops_register: base chain register function
 * @ops_unregister: base chain unregister function
 */
struct nft_chain_type {
	const char *name;
	enum nft_chain_types type;
	int family;
	struct module *owner;
	unsigned int hook_mask;
	nf_hookfn *hooks[NFT_MAX_HOOKS];
	int (*ops_register)(struct net *net, const struct nf_hook_ops *ops);
	void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops);
};

int nft_chain_validate_dependency(const struct nft_chain *chain,
				  enum nft_chain_types type);
int nft_chain_validate_hooks(const struct nft_chain *chain,
			     unsigned int hook_flags);

/* True when the chain has the NFT_CHAIN_BINDING flag set. */
static inline bool nft_chain_binding(const struct nft_chain *chain)
{
	return chain->flags & NFT_CHAIN_BINDING;
}

/* True when the chain is a binding chain and its bound bit is set. */
static inline bool nft_chain_is_bound(struct nft_chain *chain)
{
	return (chain->flags & NFT_CHAIN_BINDING) && chain->bound;
}

int nft_chain_add(struct nft_table *table, struct nft_chain *chain);
void nft_chain_del(struct nft_chain *chain);
void nf_tables_chain_destroy(struct nft_chain *chain);

/* Byte and packet counters plus the u64_stats sync object guarding them. */
struct nft_stats {
	u64 bytes;
	u64 pkts;
	struct u64_stats_sync syncp;
};

struct nft_hook {
	struct list_head list;
	struct nf_hook_ops ops;
	struct rcu_head rcu;
};

/**
 * struct nft_base_chain - nf_tables base chain
 *
 * @ops: netfilter hook ops
 * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family)
 * @type: chain type
 * @policy: default policy
 * @flags:
indicate the base chain disabled or not
 * @stats: per-cpu chain stats
 * @chain: the chain
 * @flow_block: flow block (for hardware offload)
 */
struct nft_base_chain {
	struct nf_hook_ops ops;
	struct list_head hook_list;
	const struct nft_chain_type *type;
	u8 policy;
	u8 flags;
	struct nft_stats __percpu *stats;
	struct nft_chain chain;
	struct flow_block flow_block;
};

/*
 * Map an embedded struct nft_chain back to its enclosing base chain.
 * The result is only meaningful if @chain really is the chain member of a
 * struct nft_base_chain; this helper does not check that.
 */
static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain)
{
	return container_of(chain, struct nft_base_chain, chain);
}

/* True when the chain has the NFT_CHAIN_BASE flag set. */
static inline bool nft_is_base_chain(const struct nft_chain *chain)
{
	return chain->flags & NFT_CHAIN_BASE;
}

int __nft_release_basechain(struct nft_ctx *ctx);

unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);

/*
 * Increment a use counter. Returns false, leaving the counter untouched,
 * when it already sits at UINT_MAX; returns true after incrementing.
 */
static inline bool nft_use_inc(u32 *use)
{
	if (*use == UINT_MAX)
		return false;

	(*use)++;

	return true;
}

/* Decrement a use counter; warns once if it was zero before the decrement. */
static inline void nft_use_dec(u32 *use)
{
	WARN_ON_ONCE((*use)-- == 0);
}

/* For error and abort path: restore use counter to previous state. */
static inline void nft_use_inc_restore(u32 *use)
{
	WARN_ON_ONCE(!nft_use_inc(use));
}

#define nft_use_dec_restore nft_use_dec

/**
 * struct nft_table - nf_tables table
 *
 * @list: used internally
 * @chains_ht: chains in the table
 * @chains: same, for stable walks
 * @sets: sets in the table
 * @objects: stateful objects in the table
 * @flowtables: flow tables in the table
 * @hgenerator: handle generator state
 * @handle: table handle
 * @use: number of chain references to this table
 * @family: address family
 * @flags: table flag (see enum nft_table_flags)
 * @genmask: generation mask
 * @nlpid: netlink port ID
 * @name: name of the table
 * @udlen: length of the user data
 * @udata: user data
 * @validate_state: internal, set when transaction adds jumps
 */
struct nft_table {
	struct list_head list;
	struct rhltable chains_ht;
	struct list_head chains;
	struct list_head sets;
	struct list_head objects;
	struct list_head flowtables;
	u64 hgenerator;
	u64 handle;
	u32 use;
	u16 family:6, flags:8, genmask:2;
	u32 nlpid;
	char
*name; u16 udlen; u8 *udata; u8 validate_state; }; static inline bool nft_table_has_owner(const struct nft_table *table) { return table->flags & NFT_TABLE_F_OWNER; } static inline bool nft_table_is_orphan(const struct nft_table *table) { return (table->flags & (NFT_TABLE_F_OWNER | NFT_TABLE_F_PERSIST)) == NFT_TABLE_F_PERSIST; } static inline bool nft_base_chain_netdev(int family, u32 hooknum) { return family == NFPROTO_NETDEV || (family == NFPROTO_INET && hooknum == NF_INET_INGRESS); } void nft_register_chain_type(const struct nft_chain_type *); void nft_unregister_chain_type(const struct nft_chain_type *); int nft_register_expr(struct nft_expr_type *); void nft_unregister_expr(struct nft_expr_type *); int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v); /** * struct nft_object_hash_key - key to lookup nft_object * * @name: name of the stateful object to look up * @table: table the object belongs to */ struct nft_object_hash_key { const char *name; const struct nft_table *table; }; /** * struct nft_object - nf_tables stateful object * * @list: table stateful object list node * @rhlhead: nft_objname_ht node * @key: keys that identify this object * @genmask: generation mask * @use: number of references to this stateful object * @handle: unique object handle * @udlen: length of user data * @udata: user data * @ops: object operations * @data: object data, layout depends on type */ struct nft_object { struct list_head list; struct rhlist_head rhlhead; struct nft_object_hash_key key; u32 genmask:2; u32 use; u64 handle; u16 udlen; u8 *udata; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_obj_data(const struct nft_object *obj) { return (void *)obj->data; } #define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) struct nft_object *nft_obj_lookup(const struct net *net, const struct nft_table *table, 
const struct nlattr *nla, u32 objtype, u8 genmask);

void nft_obj_notify(struct net *net, const struct nft_table *table,
		    struct nft_object *obj, u32 portid, u32 seq,
		    int event, u16 flags, int family, int report, gfp_t gfp);

/**
 * struct nft_object_type - stateful object type
 *
 * @select_ops: function to select nft_object_ops
 * @ops: default ops, used when no select_ops functions is present
 * @list: list node in list of object types
 * @type: stateful object numeric type
 * @owner: module owner
 * @maxattr: maximum netlink attribute
 * @family: address family for AF-specific object types
 * @policy: netlink attribute policy
 */
struct nft_object_type {
	const struct nft_object_ops *(*select_ops)(const struct nft_ctx *,
						   const struct nlattr * const tb[]);
	const struct nft_object_ops *ops;
	struct list_head list;
	u32 type;
	unsigned int maxattr;
	u8 family;
	struct module *owner;
	const struct nla_policy *policy;
};

/**
 * struct nft_object_ops - stateful object operations
 *
 * @eval: stateful object evaluation function
 * @size: stateful object size
 * @init: initialize object from netlink attributes
 * @destroy: release existing stateful object
 * @dump: netlink dump stateful object
 * @update: update stateful object
 * @type: pointer to object type
 */
struct nft_object_ops {
	void (*eval)(struct nft_object *obj, struct nft_regs *regs,
		     const struct nft_pktinfo *pkt);
	unsigned int size;
	int (*init)(const struct nft_ctx *ctx, const struct nlattr *const tb[],
		    struct nft_object *obj);
	void (*destroy)(const struct nft_ctx *ctx, struct nft_object *obj);
	int (*dump)(struct sk_buff *skb, struct nft_object *obj, bool reset);
	void (*update)(struct nft_object *obj, struct nft_object *newobj);
	const struct nft_object_type *type;
};

int nft_register_obj(struct nft_object_type *obj_type);
void nft_unregister_obj(struct nft_object_type *obj_type);

#define NFT_NETDEVICE_MAX 256

/**
 * struct nft_flowtable - nf_tables flow table
 *
 * @list: flow table list node in table list
 * @table: the table the flow
table is contained in
 * @name: name of this flow table
 * @hooknum: hook number
 * @ops_len: number of hooks in array
 * @genmask: generation mask
 * @use: number of references to this flow table
 * @handle: unique object handle
 * @hook_list: hook list for hooks per net_device in flowtables
 * @data: rhashtable and garbage collector
 */
struct nft_flowtable {
	struct list_head list;
	struct nft_table *table;
	char *name;
	int hooknum;
	int ops_len;
	u32 genmask:2;
	u32 use;
	u64 handle;
	/* runtime data below here */
	struct list_head hook_list ____cacheline_aligned;
	struct nf_flowtable data;
};

struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
					   const struct nlattr *nla,
					   u8 genmask);

void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx,
				    struct nft_flowtable *flowtable,
				    enum nft_trans_phase phase);

void nft_register_flowtable_type(struct nf_flowtable_type *type);
void nft_unregister_flowtable_type(struct nf_flowtable_type *type);

/**
 * struct nft_traceinfo - nft tracing information and state
 *
 * @trace: other struct members are initialised
 * @nf_trace: copy of skb->nf_trace before rule evaluation
 * @type: event type (enum nft_trace_types)
 * @skbid: hash of skb to be used as trace id
 * @packet_dumped: packet headers sent in a previous traceinfo message
 * @basechain: base chain currently processed
 */
struct nft_traceinfo {
	bool trace;
	bool nf_trace;
	bool packet_dumped;
	enum nft_trace_types type:8;
	u32 skbid;
	const struct nft_base_chain *basechain;
};

void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
		    const struct nft_chain *basechain);

void nft_trace_notify(const struct nft_pktinfo *pkt,
		      const struct nft_verdict *verdict,
		      const struct nft_rule_dp *rule,
		      struct nft_traceinfo *info);

/*
 * Module alias helpers: each builds an alias string from a fixed "nft-..."
 * prefix plus the stringified family and/or the given name.
 */
#define MODULE_ALIAS_NFT_CHAIN(family, name) \
	MODULE_ALIAS("nft-chain-" __stringify(family) "-" name)

#define MODULE_ALIAS_NFT_AF_EXPR(family, name) \
	MODULE_ALIAS("nft-expr-" __stringify(family) "-" name)

#define MODULE_ALIAS_NFT_EXPR(name) \
MODULE_ALIAS("nft-expr-" name)

#define MODULE_ALIAS_NFT_OBJ(type) \
	MODULE_ALIAS("nft-obj-" __stringify(type))

#if IS_ENABLED(CONFIG_NF_TABLES)

/*
 * The gencursor defines two generations, the currently active and the
 * next one. Objects contain a bitmask of 2 bits specifying the generations
 * they're active in. A set bit means they're inactive in the generation
 * represented by that bit.
 *
 * New objects start out as inactive in the current and active in the
 * next generation. When committing the ruleset the bitmask is cleared,
 * meaning they're active in all generations. When removing an object,
 * it is set inactive in the next generation. After committing the ruleset,
 * the objects are removed.
 */
/* Yields 1 when the current cursor is 0, and 0 for any other cursor value. */
static inline unsigned int nft_gencursor_next(const struct net *net)
{
	return net->nft.gencursor + 1 == 1 ? 1 : 0;
}

/* Single-bit mask selecting the next generation. */
static inline u8 nft_genmask_next(const struct net *net)
{
	return 1 << nft_gencursor_next(net);
}

/* Single-bit mask selecting the current generation. */
static inline u8 nft_genmask_cur(const struct net *net)
{
	/* Use READ_ONCE() to prevent refetching the value for atomicity */
	return 1 << READ_ONCE(net->nft.gencursor);
}

/* Mask covering both generation bits. */
#define NFT_GENMASK_ANY ((1 << 0) | (1 << 1))

/*
 * Generic transaction helpers
 */

/* Check if this object is currently active. */
#define nft_is_active(__net, __obj) \
	(((__obj)->genmask & nft_genmask_cur(__net)) == 0)

/* Check if this object is active in the next generation. */
#define nft_is_active_next(__net, __obj) \
	(((__obj)->genmask & nft_genmask_next(__net)) == 0)

/* This object becomes active in the next generation. */
#define nft_activate_next(__net, __obj) \
	(__obj)->genmask = nft_genmask_cur(__net)

/* This object becomes inactive in the next generation. */
#define nft_deactivate_next(__net, __obj) \
	(__obj)->genmask = nft_genmask_next(__net)

/* After committing the ruleset, clear the stale generation bit.
*/ #define nft_clear(__net, __obj) \ (__obj)->genmask &= ~nft_genmask_next(__net) #define nft_active_genmask(__obj, __genmask) \ !((__obj)->genmask & __genmask) /* * Set element transaction helpers */ static inline bool nft_set_elem_active(const struct nft_set_ext *ext, u8 genmask) { return !(ext->genmask & genmask); } static inline void nft_set_elem_change_active(const struct net *net, const struct nft_set *set, struct nft_set_ext *ext) { ext->genmask ^= nft_genmask_next(net); } #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ #define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif static inline void nft_set_elem_dead(struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); set_bit(NFT_SET_ELEM_DEAD_BIT, word); } static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); return test_bit(NFT_SET_ELEM_DEAD_BIT, word); } /** * struct nft_trans - nf_tables object update in transaction * * @list: used internally * @net: struct net * @table: struct nft_table the object resides in * @msg_type: message type * @seq: netlink sequence number * @flags: modifiers to new request * @report: notify via unicast netlink message * @put_net: net needs to be put * * This is the information common to all objects in the transaction, * this must always be the first member of derived sub-types. 
*/
struct nft_trans {
	struct list_head list;
	struct net *net;
	struct nft_table *table;
	int msg_type;
	u32 seq;
	u16 flags;
	u8 report:1;
	u8 put_net:1;
};

/**
 * struct nft_trans_binding - nf_tables object with binding support in transaction
 * @nft_trans: base structure, MUST be first member
 * @binding_list: list of objects with possible bindings
 *
 * This is the base type used by objects that can be bound to a chain.
 */
struct nft_trans_binding {
	struct nft_trans nft_trans;
	struct list_head binding_list;
};

/* Transaction record for a rule; embeds struct nft_trans as first member. */
struct nft_trans_rule {
	struct nft_trans nft_trans;
	struct nft_rule *rule;
	struct nft_chain *chain;
	struct nft_flow_rule *flow;
	u32 rule_id;
	bool bound;
};

/*
 * Accessors below take a pointer to the embedded struct nft_trans, recover
 * the enclosing struct nft_trans_rule with container_of() and expand to one
 * of its members.
 */
#define nft_trans_container_rule(trans) \
	container_of(trans, struct nft_trans_rule, nft_trans)
#define nft_trans_rule(trans) \
	nft_trans_container_rule(trans)->rule
#define nft_trans_flow_rule(trans) \
	nft_trans_container_rule(trans)->flow
#define nft_trans_rule_id(trans) \
	nft_trans_container_rule(trans)->rule_id
#define nft_trans_rule_bound(trans) \
	nft_trans_container_rule(trans)->bound
#define nft_trans_rule_chain(trans) \
	nft_trans_container_rule(trans)->chain

/* Transaction record for a set; embeds struct nft_trans_binding first. */
struct nft_trans_set {
	struct nft_trans_binding nft_trans_binding;
	struct nft_set *set;
	u32 set_id;
	u32 gc_int;
	u64 timeout;
	bool update;
	bool bound;
	u32 size;
};

/* Same accessor pattern, going through nft_trans_binding.nft_trans. */
#define nft_trans_container_set(t) \
	container_of(t, struct nft_trans_set, nft_trans_binding.nft_trans)
#define nft_trans_set(trans) \
	nft_trans_container_set(trans)->set
#define nft_trans_set_id(trans) \
	nft_trans_container_set(trans)->set_id
#define nft_trans_set_bound(trans) \
	nft_trans_container_set(trans)->bound
#define nft_trans_set_update(trans) \
	nft_trans_container_set(trans)->update
#define nft_trans_set_timeout(trans) \
	nft_trans_container_set(trans)->timeout
#define nft_trans_set_gc_int(trans) \
	nft_trans_container_set(trans)->gc_int
#define nft_trans_set_size(trans) \
	nft_trans_container_set(trans)->size

struct nft_trans_chain {
	struct nft_trans_binding nft_trans_binding;
	struct
nft_chain *chain;
	char *name;
	struct nft_stats __percpu *stats;
	u8 policy;
	bool update;
	bool bound;
	u32 chain_id;
	struct nft_base_chain *basechain;
	struct list_head hook_list;
};

/*
 * Accessors: recover the enclosing struct nft_trans_chain from a pointer to
 * the struct nft_trans nested inside nft_trans_binding, then expand to one
 * of its members.
 */
#define nft_trans_container_chain(t) \
	container_of(t, struct nft_trans_chain, nft_trans_binding.nft_trans)
#define nft_trans_chain(trans) \
	nft_trans_container_chain(trans)->chain
#define nft_trans_chain_update(trans) \
	nft_trans_container_chain(trans)->update
#define nft_trans_chain_name(trans) \
	nft_trans_container_chain(trans)->name
#define nft_trans_chain_stats(trans) \
	nft_trans_container_chain(trans)->stats
#define nft_trans_chain_policy(trans) \
	nft_trans_container_chain(trans)->policy
#define nft_trans_chain_bound(trans) \
	nft_trans_container_chain(trans)->bound
#define nft_trans_chain_id(trans) \
	nft_trans_container_chain(trans)->chain_id
#define nft_trans_basechain(trans) \
	nft_trans_container_chain(trans)->basechain
#define nft_trans_chain_hooks(trans) \
	nft_trans_container_chain(trans)->hook_list

/* Transaction record for a table; embeds struct nft_trans as first member. */
struct nft_trans_table {
	struct nft_trans nft_trans;
	bool update;
};

#define nft_trans_container_table(trans) \
	container_of(trans, struct nft_trans_table, nft_trans)
#define nft_trans_table_update(trans) \
	nft_trans_container_table(trans)->update

/* Transaction record for a set element. */
struct nft_trans_elem {
	struct nft_trans nft_trans;
	struct nft_set *set;
	struct nft_elem_priv *elem_priv;
	bool bound;
};

#define nft_trans_container_elem(t) \
	container_of(t, struct nft_trans_elem, nft_trans)
#define nft_trans_elem_set(trans) \
	nft_trans_container_elem(trans)->set
#define nft_trans_elem_priv(trans) \
	nft_trans_container_elem(trans)->elem_priv
#define nft_trans_elem_set_bound(trans) \
	nft_trans_container_elem(trans)->bound

/* Transaction record for a stateful object. */
struct nft_trans_obj {
	struct nft_trans nft_trans;
	struct nft_object *obj;
	struct nft_object *newobj;
	bool update;
};

#define nft_trans_container_obj(t) \
	container_of(t, struct nft_trans_obj, nft_trans)
#define nft_trans_obj(trans) \
	nft_trans_container_obj(trans)->obj
#define nft_trans_obj_newobj(trans) \
nft_trans_container_obj(trans)->newobj
#define nft_trans_obj_update(trans) \
	nft_trans_container_obj(trans)->update

/* Transaction record for a flowtable. */
struct nft_trans_flowtable {
	struct nft_trans nft_trans;
	struct nft_flowtable *flowtable;
	struct list_head hook_list;
	u32 flags;
	bool update;
};

#define nft_trans_container_flowtable(t) \
	container_of(t, struct nft_trans_flowtable, nft_trans)
#define nft_trans_flowtable(trans) \
	nft_trans_container_flowtable(trans)->flowtable
#define nft_trans_flowtable_update(trans) \
	nft_trans_container_flowtable(trans)->update
#define nft_trans_flowtable_hooks(trans) \
	nft_trans_container_flowtable(trans)->hook_list
#define nft_trans_flowtable_flags(trans) \
	nft_trans_container_flowtable(trans)->flags

/* Capacity of the priv[] array in struct nft_trans_gc. */
#define NFT_TRANS_GC_BATCHCOUNT 256

struct nft_trans_gc {
	struct list_head list;
	struct net *net;
	struct nft_set *set;
	u32 seq;
	u16 count;
	struct nft_elem_priv *priv[NFT_TRANS_GC_BATCHCOUNT];
	struct rcu_head rcu;
};

/*
 * Fill @ctx from @trans. The chain is taken from the rule or chain
 * transaction record depending on the message type, and is NULL for every
 * other message type. The remaining fields are copied from @trans; the
 * family is read through trans->table.
 */
static inline void nft_ctx_update(struct nft_ctx *ctx,
				  const struct nft_trans *trans)
{
	switch (trans->msg_type) {
	case NFT_MSG_NEWRULE:
	case NFT_MSG_DELRULE:
	case NFT_MSG_DESTROYRULE:
		ctx->chain = nft_trans_rule_chain(trans);
		break;
	case NFT_MSG_NEWCHAIN:
	case NFT_MSG_DELCHAIN:
	case NFT_MSG_DESTROYCHAIN:
		ctx->chain = nft_trans_chain(trans);
		break;
	default:
		ctx->chain = NULL;
		break;
	}

	ctx->net = trans->net;
	ctx->table = trans->table;
	ctx->family = trans->table->family;
	ctx->report = trans->report;
	ctx->flags = trans->flags;
	ctx->seq = trans->seq;
}

struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
					unsigned int gc_seq, gfp_t gfp);
void nft_trans_gc_destroy(struct nft_trans_gc *trans);

struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
					      unsigned int gc_seq, gfp_t gfp);
void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc);

struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp);
void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans);

void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void
*priv);

struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc,
						 unsigned int gc_seq);
struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc);

void nft_setelem_data_deactivate(const struct net *net,
				 const struct nft_set *set,
				 struct nft_elem_priv *elem_priv);

int __init nft_chain_filter_init(void);
void nft_chain_filter_fini(void);

void __init nft_chain_route_init(void);
void nft_chain_route_fini(void);

void nf_tables_trans_destroy_flush_work(void);

int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result);
__be64 nf_jiffies64_to_msecs(u64 input);

#ifdef CONFIG_MODULES
__printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...);
#else
/* Stub used when CONFIG_MODULES is not defined: always returns -ENOENT. */
static inline int nft_request_module(struct net *net, const char *fmt, ...)
{
	return -ENOENT;
}
#endif

/* nf_tables state reached through net_generic() (see nft_pernet()). */
struct nftables_pernet {
	struct list_head tables;
	struct list_head commit_list;
	struct list_head binding_list;
	struct list_head module_list;
	struct list_head notify_list;
	struct mutex commit_mutex;
	u64 table_handle;
	u64 tstamp;
	unsigned int base_seq;
	unsigned int gc_seq;
	u8 validate_state;
};

extern unsigned int nf_tables_net_id;

/* Look up the nf_tables state of @net via net_generic(). */
static inline struct nftables_pernet *nft_pernet(const struct net *net)
{
	return net_generic(net, nf_tables_net_id);
}

/* Read the tstamp field of the nf_tables state of @net. */
static inline u64 nft_net_tstamp(const struct net *net)
{
	return nft_pernet(net)->tstamp;
}

/* Sentinel value stored in nft_expr_ops.reduce instead of a real callback. */
#define __NFT_REDUCE_READONLY 1UL
#define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY

/* True when the expression's reduce op is the NFT_REDUCE_READONLY sentinel. */
static inline bool nft_reduce_is_readonly(const struct nft_expr *expr)
{
	return expr->ops->reduce == NFT_REDUCE_READONLY;
}

void nft_reg_track_update(struct nft_regs_track *track,
			  const struct nft_expr *expr, u8 dreg, u8 len);
void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len);
void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg);

static inline bool nft_reg_track_cmp(struct nft_regs_track *track,
				     const struct nft_expr *expr, u8 dreg)
{
	return track->regs[dreg].selector &&
	       track->regs[dreg].selector->ops == expr->ops
&& track->regs[dreg].num_reg == 0; } #endif /* _NET_NF_TABLES_H */ |
| 25 135 144 143 315 303 71 1 1 1 129 130 93 118 137 17 94 42 3 44 2 2 2 3 2 1 1 3 22 6 47 297 298 290 242 290 9 70 294 23 183 79 171 133 86 137 38 138 139 139 49 49 49 2 48 49 49 137 2 2 139 139 132 47 137 137 137 139 139 140 135 139 139 143 143 143 143 132 22 315 258 300 19 51 21 49 76 70 12 76 76 5 31 50 1 50 1 315 39 95 267 311 15 1 314 310 15 2 316 314 76 293 13 12 316 139 236 95 264 265 267 264 168 95 95 35 73 315 309 15 315 315 95 265 302 64 310 15 267 267 95 95 12 12 135 169 88 88 88 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 
403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 
903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Oracle. All rights reserved. 
*/ #include <linux/sched.h> #include <linux/slab.h> #include <linux/sort.h> #include "messages.h" #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" #include "qgroup.h" #include "space-info.h" #include "tree-mod-log.h" #include "fs.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_ref_node_cachep; struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * delayed back reference update tracking. For subvolume trees * we queue up extent allocations and backref maintenance for * delayed processing. This avoids deep call chains where we * add extents in the middle of btrfs_search_slot, and it allows * us to buffer up frequently modified backrefs in an rb tree instead * of hammering updates on the extent allocation tree. */ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; bool ret = false; u64 reserved; spin_lock(&global_rsv->lock); reserved = global_rsv->reserved; spin_unlock(&global_rsv->lock); /* * Since the global reserve is just kind of magic we don't really want * to rely on it to save our bacon, so if our size is more than the * delayed_refs_rsv and the global rsv then it's time to think about * bailing. */ spin_lock(&delayed_refs_rsv->lock); reserved += delayed_refs_rsv->reserved; if (delayed_refs_rsv->size >= reserved) ret = true; spin_unlock(&delayed_refs_rsv->lock); return ret; } /* * Release a ref head's reservation. * * @fs_info: the filesystem * @nr_refs: number of delayed refs to drop * @nr_csums: number of csum items to drop * * Drops the delayed ref head's count from the delayed refs rsv and free any * excess reservation we had. 
*/ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; u64 num_bytes; u64 released; num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs); num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, released, 0); } /* * Adjust the size of the delayed refs rsv. * * This is to be called anytime we may have adjusted trans->delayed_ref_updates * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and * add it to the delayed_refs_rsv. */ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv; u64 num_bytes; u64 reserved_bytes; num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, trans->delayed_ref_csum_deletions); if (num_bytes == 0) return; /* * Try to take num_bytes from the transaction's local delayed reserve. * If not possible, try to take as much as it's available. If the local * reserve doesn't have enough reserved space, the delayed refs reserve * will be refilled next time btrfs_delayed_refs_rsv_refill() is called * by someone or if a transaction commit is triggered before that, the * global block reserve will be used. We want to minimize using the * global block reserve for cases we can account for in advance, to * avoid exhausting it and reach -ENOSPC during a transaction commit. 
*/ spin_lock(&local_rsv->lock); reserved_bytes = min(num_bytes, local_rsv->reserved); local_rsv->reserved -= reserved_bytes; local_rsv->full = (local_rsv->reserved >= local_rsv->size); spin_unlock(&local_rsv->lock); spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; delayed_rsv->reserved += reserved_bytes; delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size); spin_unlock(&delayed_rsv->lock); trans->delayed_ref_updates = 0; trans->delayed_ref_csum_deletions = 0; } /* * Adjust the size of the delayed refs block reserve for 1 block group item * insertion, used after allocating a block group. */ void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; spin_lock(&delayed_rsv->lock); /* * Inserting a block group item does not require changing the free space * tree, only the extent tree or the block group tree, so this is all we * need. */ delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1); delayed_rsv->full = false; spin_unlock(&delayed_rsv->lock); } /* * Adjust the size of the delayed refs block reserve to release space for 1 * block group item insertion. */ void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); u64 released; released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); if (released > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, released, 0); } /* * Adjust the size of the delayed refs block reserve for 1 block group item * update. 
*/ void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; spin_lock(&delayed_rsv->lock); /* * Updating a block group item does not result in new nodes/leaves and * does not require changing the free space tree, only the extent tree * or the block group tree, so this is all we need. */ delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1); delayed_rsv->full = false; spin_unlock(&delayed_rsv->lock); } /* * Adjust the size of the delayed refs block reserve to release space for 1 * block group item update. */ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1); u64 released; released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); if (released > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, released, 0); } /* * Refill based on our delayed refs usage. * * @fs_info: the filesystem * @flush: control how we can flush for this reservation. * * This will refill the delayed block_rsv up to 1 items size worth of space and * will return -ENOSPC if we can't make the reservation. 
*/ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; struct btrfs_space_info *space_info = block_rsv->space_info; u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); u64 num_bytes = 0; u64 refilled_bytes; u64 to_free; int ret = -ENOSPC; spin_lock(&block_rsv->lock); if (block_rsv->reserved < block_rsv->size) { num_bytes = block_rsv->size - block_rsv->reserved; num_bytes = min(num_bytes, limit); } spin_unlock(&block_rsv->lock); if (!num_bytes) return 0; ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush); if (ret) return ret; /* * We may have raced with someone else, so check again if we the block * reserve is still not full and release any excess space. */ spin_lock(&block_rsv->lock); if (block_rsv->reserved < block_rsv->size) { u64 needed = block_rsv->size - block_rsv->reserved; if (num_bytes >= needed) { block_rsv->reserved += needed; block_rsv->full = true; to_free = num_bytes - needed; refilled_bytes = needed; } else { block_rsv->reserved += num_bytes; to_free = 0; refilled_bytes = num_bytes; } } else { to_free = num_bytes; refilled_bytes = 0; } spin_unlock(&block_rsv->lock); if (to_free > 0) btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free); if (refilled_bytes > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, refilled_bytes, 1); return 0; } /* * compare two delayed data backrefs with same bytenr and type */ static int comp_data_refs(struct btrfs_delayed_ref_node *ref1, struct btrfs_delayed_ref_node *ref2) { if (ref1->data_ref.objectid < ref2->data_ref.objectid) return -1; if (ref1->data_ref.objectid > ref2->data_ref.objectid) return 1; if (ref1->data_ref.offset < ref2->data_ref.offset) return -1; if (ref1->data_ref.offset > ref2->data_ref.offset) return 1; return 0; } static int comp_refs(struct btrfs_delayed_ref_node *ref1, struct btrfs_delayed_ref_node *ref2, bool check_seq) { int ret = 
0; if (ref1->type < ref2->type) return -1; if (ref1->type > ref2->type) return 1; if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_DATA_REF_KEY) { if (ref1->parent < ref2->parent) return -1; if (ref1->parent > ref2->parent) return 1; } else { if (ref1->ref_root < ref2->ref_root) return -1; if (ref1->ref_root > ref2->ref_root) return -1; if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY) ret = comp_data_refs(ref1, ref2); } if (ret) return ret; if (check_seq) { if (ref1->seq < ref2->seq) return -1; if (ref1->seq > ref2->seq) return 1; } return 0; } /* insert a new ref to head ref rbtree */ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root, struct rb_node *node) { struct rb_node **p = &root->rb_root.rb_node; struct rb_node *parent_node = NULL; struct btrfs_delayed_ref_head *entry; struct btrfs_delayed_ref_head *ins; u64 bytenr; bool leftmost = true; ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); bytenr = ins->bytenr; while (*p) { parent_node = *p; entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, href_node); if (bytenr < entry->bytenr) { p = &(*p)->rb_left; } else if (bytenr > entry->bytenr) { p = &(*p)->rb_right; leftmost = false; } else { return entry; } } rb_link_node(node, parent_node, p); rb_insert_color_cached(node, root, leftmost); return NULL; } static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { struct rb_node **p = &root->rb_root.rb_node; struct rb_node *node = &ins->ref_node; struct rb_node *parent_node = NULL; struct btrfs_delayed_ref_node *entry; bool leftmost = true; while (*p) { int comp; parent_node = *p; entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, ref_node); comp = comp_refs(ins, entry, true); if (comp < 0) { p = &(*p)->rb_left; } else if (comp > 0) { p = &(*p)->rb_right; leftmost = false; } else { return entry; } } rb_link_node(node, parent_node, p); rb_insert_color_cached(node, root, 
leftmost); return NULL; } static struct btrfs_delayed_ref_head *find_first_ref_head( struct btrfs_delayed_ref_root *dr) { struct rb_node *n; struct btrfs_delayed_ref_head *entry; n = rb_first_cached(&dr->href_root); if (!n) return NULL; entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); return entry; } /* * Find a head entry based on bytenr. This returns the delayed ref head if it * was able to find one, or NULL if nothing was in that spot. If return_bigger * is given, the next bigger entry is returned if no exact match is found. */ static struct btrfs_delayed_ref_head *find_ref_head( struct btrfs_delayed_ref_root *dr, u64 bytenr, bool return_bigger) { struct rb_root *root = &dr->href_root.rb_root; struct rb_node *n; struct btrfs_delayed_ref_head *entry; n = root->rb_node; entry = NULL; while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); if (bytenr < entry->bytenr) n = n->rb_left; else if (bytenr > entry->bytenr) n = n->rb_right; else return entry; } if (entry && return_bigger) { if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node); if (!n) return NULL; entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); } return entry; } return NULL; } int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) return 0; refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); spin_lock(&delayed_refs->lock); if (RB_EMPTY_NODE(&head->href_node)) { mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return -EAGAIN; } btrfs_put_delayed_ref_head(head); return 0; } static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { lockdep_assert_held(&head->lock); rb_erase_cached(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if 
(!list_empty(&ref->add_list)) list_del(&ref->add_list); btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } static bool merge_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) { struct btrfs_delayed_ref_node *next; struct rb_node *node = rb_next(&ref->ref_node); bool done = false; while (!done && node) { int mod; next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); node = rb_next(node); if (seq && next->seq >= seq) break; if (comp_refs(ref, next, false)) break; if (ref->action == next->action) { mod = next->ref_mod; } else { if (ref->ref_mod < next->ref_mod) { swap(ref, next); done = true; } mod = -next->ref_mod; } drop_delayed_ref(fs_info, delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { drop_delayed_ref(fs_info, delayed_refs, head, ref); done = true; } else { /* * Can't have multiples of the same ref on a tree block. */ WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || ref->type == BTRFS_SHARED_BLOCK_REF_KEY); } } return done; } void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; struct rb_node *node; u64 seq = 0; lockdep_assert_held(&head->lock); if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) return; /* We don't have too many refs to merge for data. 
*/ if (head->is_data) return; seq = btrfs_tree_mod_log_lowest_seq(fs_info); again: for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) { ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; if (merge_ref(fs_info, delayed_refs, head, ref, seq)) goto again; } } int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) { int ret = 0; u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info); if (min_seq != 0 && seq >= min_seq) { btrfs_debug(fs_info, "holding back delayed_ref %llu, lowest is %llu", seq, min_seq); ret = 1; } return ret; } struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs) { struct btrfs_delayed_ref_head *head; lockdep_assert_held(&delayed_refs->lock); again: head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, true); if (!head && delayed_refs->run_delayed_start != 0) { delayed_refs->run_delayed_start = 0; head = find_first_ref_head(delayed_refs); } if (!head) return NULL; while (head->processing) { struct rb_node *node; node = rb_next(&head->href_node); if (!node) { if (delayed_refs->run_delayed_start == 0) return NULL; delayed_refs->run_delayed_start = 0; goto again; } head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); } head->processing = true; WARN_ON(delayed_refs->num_heads_ready == 0); delayed_refs->num_heads_ready--; delayed_refs->run_delayed_start = head->bytenr + head->num_bytes; return head; } void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { lockdep_assert_held(&delayed_refs->lock); lockdep_assert_held(&head->lock); rb_erase_cached(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); atomic_dec(&delayed_refs->num_entries); delayed_refs->num_heads--; if (!head->processing) delayed_refs->num_heads_ready--; } /* * Helper to insert the ref_node to the tail or merge with tail. 
* * Return false if the ref was inserted. * Return true if the ref was merged into an existing one (and therefore can be * freed by the caller). */ static bool insert_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs; struct btrfs_delayed_ref_node *exist; int mod; spin_lock(&href->lock); exist = tree_insert(&href->ref_tree, ref); if (!exist) { if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); spin_unlock(&href->lock); trans->delayed_ref_updates++; return false; } /* Now we are sure we can merge */ if (exist->action == ref->action) { mod = ref->ref_mod; } else { /* Need to change action */ if (exist->ref_mod < ref->ref_mod) { exist->action = ref->action; mod = -exist->ref_mod; exist->ref_mod = ref->ref_mod; if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&exist->add_list, &href->ref_add_list); else if (ref->action == BTRFS_DROP_DELAYED_REF) { ASSERT(!list_empty(&exist->add_list)); list_del(&exist->add_list); } else { ASSERT(0); } } else mod = -ref->ref_mod; } exist->ref_mod += mod; /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) drop_delayed_ref(trans->fs_info, root, href, exist); spin_unlock(&href->lock); return true; } /* * helper function to update the accounting in the head ref * existing and update must have the same bytenr */ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing, struct btrfs_delayed_ref_head *update) { struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod; BUG_ON(existing->is_data != update->is_data); spin_lock(&existing->lock); /* * When freeing an extent, we may not know the owning root when we * first create the head_ref. 
However, some deref before the last deref * will know it, so we just need to update the head_ref accordingly. */ if (!existing->owning_root) existing->owning_root = update->owning_root; if (update->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref * entries were processed, we can end up * with an existing head ref without * the must_insert_reserved flag set. * Set it again here */ existing->must_insert_reserved = update->must_insert_reserved; existing->owning_root = update->owning_root; /* * update the num_bytes so we make sure the accounting * is done correctly */ existing->num_bytes = update->num_bytes; } if (update->extent_op) { if (!existing->extent_op) { existing->extent_op = update->extent_op; } else { if (update->extent_op->update_key) { memcpy(&existing->extent_op->key, &update->extent_op->key, sizeof(update->extent_op->key)); existing->extent_op->update_key = true; } if (update->extent_op->update_flags) { existing->extent_op->flags_to_set |= update->extent_op->flags_to_set; existing->extent_op->update_flags = true; } btrfs_free_delayed_extent_op(update->extent_op); } } /* * update the reference mod on the head to reflect this new operation, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ old_ref_mod = existing->total_ref_mod; existing->ref_mod += update->ref_mod; existing->total_ref_mod += update->ref_mod; /* * If we are going to from a positive ref mod to a negative or vice * versa we need to make sure to adjust pending_csums accordingly. * We reserve bytes for csum deletion when adding or updating a ref head * see add_delayed_ref_head() for more details. 
*/ if (existing->is_data) { u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, existing->num_bytes); if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { delayed_refs->pending_csums -= existing->num_bytes; btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves); } if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { delayed_refs->pending_csums += existing->num_bytes; trans->delayed_ref_csum_deletions += csum_leaves; } } spin_unlock(&existing->lock); } static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, struct btrfs_ref *generic_ref, struct btrfs_qgroup_extent_record *qrecord, u64 reserved) { int count_mod = 1; bool must_insert_reserved = false; /* If reserved is provided, it must be a data extent. */ BUG_ON(generic_ref->type != BTRFS_REF_DATA && reserved); switch (generic_ref->action) { case BTRFS_ADD_DELAYED_REF: /* count_mod is already set to 1. */ break; case BTRFS_UPDATE_DELAYED_HEAD: count_mod = 0; break; case BTRFS_DROP_DELAYED_REF: /* * The head node stores the sum of all the mods, so dropping a ref * should drop the sum in the head node by one. */ count_mod = -1; break; case BTRFS_ADD_DELAYED_EXTENT: /* * BTRFS_ADD_DELAYED_EXTENT means that we need to update the * reserved accounting when the extent is finally added, or if a * later modification deletes the delayed ref without ever * inserting the extent into the extent allocation tree. * ref->must_insert_reserved is the flag used to record that * accounting mods are required. * * Once we record must_insert_reserved, switch the action to * BTRFS_ADD_DELAYED_REF because other special casing is not * required. 
*/ must_insert_reserved = true; break; } refcount_set(&head_ref->refs, 1); head_ref->bytenr = generic_ref->bytenr; head_ref->num_bytes = generic_ref->num_bytes; head_ref->ref_mod = count_mod; head_ref->reserved_bytes = reserved; head_ref->must_insert_reserved = must_insert_reserved; head_ref->owning_root = generic_ref->owning_root; head_ref->is_data = (generic_ref->type == BTRFS_REF_DATA); head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID); head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); RB_CLEAR_NODE(&head_ref->href_node); head_ref->processing = false; head_ref->total_ref_mod = count_mod; spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); /* If not metadata set an impossible level to help debugging. */ if (generic_ref->type == BTRFS_REF_METADATA) head_ref->level = generic_ref->tree_ref.level; else head_ref->level = U8_MAX; if (qrecord) { if (generic_ref->ref_root && reserved) { qrecord->data_rsv = reserved; qrecord->data_rsv_refroot = generic_ref->ref_root; } qrecord->bytenr = generic_ref->bytenr; qrecord->num_bytes = generic_ref->num_bytes; qrecord->old_roots = NULL; } } /* * helper function to actually insert a head node into the rbtree. * this does all the dirty work in terms of maintaining the correct * overall modification count. 
*/ static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, int action, bool *qrecord_inserted_ret) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; bool qrecord_inserted = false; delayed_refs = &trans->transaction->delayed_refs; /* Record qgroup extent info if provided */ if (qrecord) { if (btrfs_qgroup_trace_extent_nolock(trans->fs_info, delayed_refs, qrecord)) kfree(qrecord); else qrecord_inserted = true; } trace_add_delayed_ref_head(trans->fs_info, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { update_existing_head_ref(trans, existing, head_ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { /* * We reserve the amount of bytes needed to delete csums when * adding the ref head and not when adding individual drop refs * since the csum items are deleted only after running the last * delayed drop ref (the data extent's ref count drops to 0). */ if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_csum_deletions += btrfs_csum_bytes_to_leaves(trans->fs_info, head_ref->num_bytes); } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; return head_ref; } /* * Initialize the structure which represents a modification to a an extent. * * @fs_info: Internal to the mounted filesystem mount structure. * * @ref: The structure which is going to be initialized. * * @bytenr: The logical address of the extent for which a modification is * going to be recorded. * * @num_bytes: Size of the extent whose modification is being recorded. 
*
 * @ref_root:   The id of the root where this modification has originated, this
 *              can be either one of the well-known metadata trees or the
 *              subvolume id which references this extent.
 *
 * @action:     Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
 *              BTRFS_ADD_DELAYED_EXTENT
 *
 * @ref_type:   Holds the type of the extent which is being recorded, can be
 *              one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
 *              when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
 *              BTRFS_EXTENT_DATA_REF_KEY when recording data extent
 *
 * NOTE: in this implementation bytenr, num_bytes, ref_root, parent and action
 * are all read from @generic_ref, and the type comes from
 * btrfs_ref_type(@generic_ref).
 */
static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
				    struct btrfs_delayed_ref_node *ref,
				    struct btrfs_ref *generic_ref)
{
	int action = generic_ref->action;
	u64 seq = 0;

	/* An "add extent" request is stored on the node as a plain "add ref". */
	if (action == BTRFS_ADD_DELAYED_EXTENT)
		action = BTRFS_ADD_DELAYED_REF;

	/* The tree mod sequence is only sampled for fs trees; otherwise seq stays 0. */
	if (is_fstree(generic_ref->ref_root))
		seq = atomic64_read(&fs_info->tree_mod_seq);

	/* The node starts with a single reference and a ref_mod of 1. */
	refcount_set(&ref->refs, 1);
	ref->bytenr = generic_ref->bytenr;
	ref->num_bytes = generic_ref->num_bytes;
	ref->ref_mod = 1;
	ref->action = action;
	ref->seq = seq;
	ref->type = btrfs_ref_type(generic_ref);
	ref->ref_root = generic_ref->ref_root;
	ref->parent = generic_ref->parent;
	/* Not yet linked into any rb-tree or add list. */
	RB_CLEAR_NODE(&ref->ref_node);
	INIT_LIST_HEAD(&ref->add_list);

	/* Copy whichever type-specific payload @generic_ref carries. */
	if (generic_ref->type == BTRFS_REF_DATA)
		ref->data_ref = generic_ref->data_ref;
	else
		ref->tree_ref = generic_ref->tree_ref;
}

/*
 * Fill in the metadata-specific part of @generic_ref.
 *
 * @generic_ref: the ref being prepared; its ref_root must already be set, as
 *               it is read here
 * @level:       stored in tree_ref.level
 * @mod_root:    root that performs the modification, 0 if not known
 * @skip_qgroup: caller's request to skip qgroup accounting
 *
 * skip_qgroup ends up true when the caller asks for it, or when ref_root is
 * not an fs tree, or when a non-zero @mod_root is not an fs tree.
 */
void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
			 bool skip_qgroup)
{
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
	/* If @mod_root is 0, fall back to generic_ref->ref_root */
	generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
	generic_ref->tree_ref.level = level;
	generic_ref->type = BTRFS_REF_METADATA;
	if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
			     (!mod_root || is_fstree(mod_root))))
		generic_ref->skip_qgroup = true;
	else
		generic_ref->skip_qgroup = false;
}

/*
 * Fill in the data-specific part of @generic_ref.
 *
 * @generic_ref: the ref being prepared; its ref_root must already be set, as
 *               it is read here
 * @ino:         stored in data_ref.objectid
 * @offset:      stored in data_ref.offset
 * @mod_root:    root that performs the modification, 0 if not known
 * @skip_qgroup: caller's request to skip qgroup accounting
 *
 * The skip_qgroup decision is the same as in btrfs_init_tree_ref().
 */
void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
			 u64 mod_root, bool skip_qgroup)
{
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
	/* If @mod_root is 0, fall back to generic_ref->ref_root */
	generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
	generic_ref->data_ref.objectid = ino;
	generic_ref->data_ref.offset = offset;
	generic_ref->type = BTRFS_REF_DATA;
	if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
			     (!mod_root || is_fstree(mod_root))))
		generic_ref->skip_qgroup = true;
	else
		generic_ref->skip_qgroup = false;
}

/*
 * Common worker for adding a delayed tree or data ref.
 *
 * Allocates a ref node and a ref head (plus a qgroup extent record when full
 * qgroup accounting is on and the ref does not skip qgroups), initialises
 * them from @generic_ref and inserts both while holding delayed_refs->lock.
 *
 * @trans:       transaction handle the ref is queued on
 * @generic_ref: description of the reference change
 * @extent_op:   stored in the new head's extent_op field (may be NULL)
 * @reserved:    passed through to init_delayed_ref_head()
 *
 * Return: 0 on success, -ENOMEM if any allocation fails, or the return value
 * of btrfs_qgroup_trace_extent_post() when the qgroup record was inserted.
 */
static int add_delayed_ref(struct btrfs_trans_handle *trans,
			   struct btrfs_ref *generic_ref,
			   struct btrfs_delayed_extent_op *extent_op,
			   u64 reserved)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_node *node;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_qgroup_extent_record *record = NULL;
	bool qrecord_inserted;
	int action = generic_ref->action;
	bool merged;

	/* All allocations happen before the spin lock is taken. */
	node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS);
	if (!node)
		return -ENOMEM;

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref) {
		kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
		return -ENOMEM;
	}

	if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
		record = kzalloc(sizeof(*record), GFP_NOFS);
		if (!record) {
			kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
			kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
			return -ENOMEM;
		}
	}

	init_delayed_ref_common(fs_info, node, generic_ref);
	init_delayed_ref_head(head_ref, generic_ref, record, reserved);
	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	/* head_ref may be replaced by an already existing head here. */
	head_ref = add_delayed_ref_head(trans, head_ref, record,
					action, &qrecord_inserted);

	merged = insert_delayed_ref(trans, head_ref, node);
	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);

	if (generic_ref->type == BTRFS_REF_DATA)
		trace_add_delayed_data_ref(trans->fs_info, node);
	else
		trace_add_delayed_tree_ref(trans->fs_info, node);
	/* When insert_delayed_ref() reports a merge, the new node is freed here. */
	if (merged)
		kmem_cache_free(btrfs_delayed_ref_node_cachep, node);

	if (qrecord_inserted)
		return btrfs_qgroup_trace_extent_post(trans, record);
	return 0;
}

/*
 * Add a delayed tree ref. This does all of the accounting required to make sure
 * the delayed ref is eventually processed before this transaction commits.
 *
 * Asserts that @generic_ref is a metadata ref with a non-zero action, then
 * forwards to add_delayed_ref() with a reserved value of 0.
 */
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_ref *generic_ref,
			       struct btrfs_delayed_extent_op *extent_op)
{
	ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
	return add_delayed_ref(trans, generic_ref, extent_op, 0);
}

/*
 * Add a delayed data ref. It is similar to btrfs_add_delayed_tree_ref(), but
 * asserts a data ref, passes no extent op and forwards @reserved.
 */
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_ref *generic_ref,
			       u64 reserved)
{
	ASSERT(generic_ref->type == BTRFS_REF_DATA && generic_ref->action);
	return add_delayed_ref(trans, generic_ref, NULL, reserved);
}

/*
 * Queue an extent op on a delayed ref head without adding a ref node.
 *
 * A head is allocated and initialised from a local metadata btrfs_ref whose
 * action is BTRFS_UPDATE_DELAYED_HEAD, @extent_op is attached to it, and it
 * is inserted under delayed_refs->lock.
 *
 * Return: 0 on success, -ENOMEM if the head cannot be allocated.
 */
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes, u8 level,
				struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_ref generic_ref = {
		.type = BTRFS_REF_METADATA,
		.action = BTRFS_UPDATE_DELAYED_HEAD,
		.bytenr = bytenr,
		.num_bytes = num_bytes,
		.tree_ref.level = level,
	};

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref)
		return -ENOMEM;

	init_delayed_ref_head(head_ref, &generic_ref, NULL, 0);
	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/* No qgroup record and no interest in the "inserted" result here. */
	add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
			     NULL);

	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);
	return 0;
}

/*
 * Drop one reference on @ref and free it once the count reaches zero.
 * Warns if the node is still linked into an rb-tree at that point.
 */
void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
	if (refcount_dec_and_test(&ref->refs)) {
		WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
		kmem_cache_free(btrfs_delayed_ref_node_cachep, ref);
	}
}

/*
 * This does a simple search for the head node for a given extent.  Returns the
 * head node if found, or NULL if not.
 *
 * The caller must hold delayed_refs->lock (checked via lockdep).
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
	lockdep_assert_held(&delayed_refs->lock);

	return find_ref_head(delayed_refs, bytenr, false);
}

/*
 * Three-way comparison of a (root, parent) search key against @entry.
 *
 * The key's type is BTRFS_SHARED_BLOCK_REF_KEY when @parent is non-zero and
 * BTRFS_TREE_BLOCK_REF_KEY otherwise. Types are compared first; for equal
 * types the tie is broken on @root (tree block refs) or @parent (shared refs).
 *
 * Return: -1, 0 or 1 when the key sorts before, equal to or after @entry.
 */
static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent)
{
	int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY;

	if (type < entry->type)
		return -1;
	if (type > entry->type)
		return 1;

	if (type == BTRFS_TREE_BLOCK_REF_KEY) {
		if (root < entry->ref_root)
			return -1;
		if (root > entry->ref_root)
			return 1;
	} else {
		if (parent < entry->parent)
			return -1;
		if (parent > entry->parent)
			return 1;
	}
	return 0;
}

/*
 * Check to see if a given root/parent reference is attached to the head.  This
 * only checks for BTRFS_ADD_DELAYED_REF references that match, as that
 * indicates the reference exists for the given root or parent.  This is for
 * tree blocks only.
 *
 * @head: the head of the bytenr we're searching.
 * @root: the root objectid of the reference if it is a normal reference.
 * @parent: the parent if this is a shared backref.
*/
bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
				 u64 root, u64 parent)
{
	struct rb_node *cur;
	bool match = false;

	lockdep_assert_held(&head->mutex);

	spin_lock(&head->lock);
	/* Plain binary-search descent of the head's ref tree. */
	for (cur = head->ref_tree.rb_root.rb_node; cur; ) {
		struct btrfs_delayed_ref_node *ref =
			rb_entry(cur, struct btrfs_delayed_ref_node, ref_node);
		int cmp = find_comp(ref, root, parent);

		if (cmp == 0) {
			/*
			 * Only an ADD action counts as a hit; a drop means
			 * the reference does not exist.
			 */
			match = (ref->action == BTRFS_ADD_DELAYED_REF);
			break;
		}
		cur = (cmp < 0) ? cur->rb_left : cur->rb_right;
	}
	spin_unlock(&head->lock);

	return match;
}

/* Destroy the three slab caches used by the delayed ref code. */
void __cold btrfs_delayed_ref_exit(void)
{
	kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
	kmem_cache_destroy(btrfs_delayed_ref_node_cachep);
	kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}

/*
 * Create the slab caches for ref heads, ref nodes and extent ops.
 *
 * Return: 0 on success; on any failure every cache is torn down through
 * btrfs_delayed_ref_exit() and -ENOMEM is returned.
 */
int __init btrfs_delayed_ref_init(void)
{
	btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0);
	if (!btrfs_delayed_ref_head_cachep)
		goto err_destroy;

	btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0);
	if (!btrfs_delayed_ref_node_cachep)
		goto err_destroy;

	btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0);
	if (!btrfs_delayed_extent_op_cachep)
		goto err_destroy;

	return 0;

err_destroy:
	btrfs_delayed_ref_exit();
	return -ENOMEM;
}
|
| 2 2 1 1 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 | // SPDX-License-Identifier: GPL-2.0 /* * Author: Andrei Vagin <avagin@openvz.org> * Author: Dmitry Safonov <dima@arista.com> */ #include 
<linux/time_namespace.h> #include <linux/user_namespace.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/clocksource.h> #include <linux/seq_file.h> #include <linux/proc_ns.h> #include <linux/export.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/err.h> #include <linux/mm.h> #include <vdso/datapage.h> ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *ns_offsets) { ktime_t offset; switch (clockid) { case CLOCK_MONOTONIC: offset = timespec64_to_ktime(ns_offsets->monotonic); break; case CLOCK_BOOTTIME: case CLOCK_BOOTTIME_ALARM: offset = timespec64_to_ktime(ns_offsets->boottime); break; default: return tim; } /* * Check that @tim value is in [offset, KTIME_MAX + offset] * and subtract offset. */ if (tim < offset) { /* * User can specify @tim *absolute* value - if it's lesser than * the time namespace's offset - it's already expired. */ tim = 0; } else { tim = ktime_sub(tim, offset); if (unlikely(tim > KTIME_MAX)) tim = KTIME_MAX; } return tim; } static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); } static void dec_time_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); } /** * clone_time_ns - Clone a time namespace * @user_ns: User namespace which owns a new namespace. * @old_ns: Namespace to clone * * Clone @old_ns and set the clone refcount to 1 * * Return: The new namespace or ERR_PTR. 
*/
static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
					  struct time_namespace *old_ns)
{
	struct time_namespace *ns;
	struct ucounts *ucounts;
	int err;

	/* err is preset before each step so the goto targets share one return. */
	err = -ENOSPC;
	ucounts = inc_time_namespaces(user_ns);
	if (!ucounts)
		goto fail;

	err = -ENOMEM;
	ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
	if (!ns)
		goto fail_dec;

	refcount_set(&ns->ns.count, 1);

	/* The page is requested zeroed (__GFP_ZERO). */
	ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!ns->vvar_page)
		goto fail_free;

	err = ns_alloc_inum(&ns->ns);
	if (err)
		goto fail_free_page;

	ns->ucounts = ucounts;
	ns->ns.ops = &timens_operations;
	ns->user_ns = get_user_ns(user_ns);
	/* The clone inherits the parent's offsets and starts out unfrozen. */
	ns->offsets = old_ns->offsets;
	ns->frozen_offsets = false;
	return ns;

	/* Unwind in reverse order of acquisition. */
fail_free_page:
	__free_page(ns->vvar_page);
fail_free:
	kfree(ns);
fail_dec:
	dec_time_namespaces(ucounts);
fail:
	return ERR_PTR(err);
}

/**
 * copy_time_ns - Create timens_for_children from @old_ns
 * @flags:	Cloning flags
 * @user_ns:	User namespace which owns a new namespace.
 * @old_ns:	Namespace to clone
 *
 * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children;
 * adds a refcounter to @old_ns otherwise.
 *
 * Return: timens_for_children namespace or ERR_PTR.
 */
struct time_namespace *copy_time_ns(unsigned long flags,
	struct user_namespace *user_ns, struct time_namespace *old_ns)
{
	if (!(flags & CLONE_NEWTIME))
		return get_time_ns(old_ns);

	return clone_time_ns(user_ns, old_ns);
}

/* Convert a timespec64 into the timens_offset (sec/nsec) representation. */
static struct timens_offset offset_from_ts(struct timespec64 off)
{
	struct timens_offset ret;

	ret.sec = off.tv_sec;
	ret.nsec = off.tv_nsec;

	return ret;
}

/*
 * A time namespace VVAR page has the same layout as the VVAR page which
 * contains the system wide VDSO data.
 *
 * For a normal task the VVAR pages are installed in the normal ordering:
 *     VVAR
 *     PVCLOCK
 *     HVCLOCK
 *     TIMENS   <- Not really required
 *
 * Now for a timens task the pages are installed in the following order:
 *     TIMENS
 *     PVCLOCK
 *     HVCLOCK
 *     VVAR
 *
 * The check for vdso_data->clock_mode is in the unlikely path of
 * the seq begin magic.
So for the non-timens case most of the time
 * 'seq' is even, so the branch is not taken.
 *
 * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
 * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
 * update to finish and for 'seq' to become even anyway.
 *
 * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which
 * enforces the time namespace handling path.
 */
static void timens_setup_vdso_data(struct vdso_data *vdata,
				   struct time_namespace *ns)
{
	struct timens_offset *offset = vdata->offset;
	struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
	struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);

	/* An odd seq plus the TIMENS clock mode, as described above. */
	vdata->seq			= 1;
	vdata->clock_mode		= VDSO_CLOCKMODE_TIMENS;
	/* All monotonic variants share one offset, both boottime clocks the other. */
	offset[CLOCK_MONOTONIC]		= monotonic;
	offset[CLOCK_MONOTONIC_RAW]	= monotonic;
	offset[CLOCK_MONOTONIC_COARSE]	= monotonic;
	offset[CLOCK_BOOTTIME]		= boottime;
	offset[CLOCK_BOOTTIME_ALARM]	= boottime;
}

/*
 * Return the VVAR page of the current task's time namespace, provided @vma
 * belongs to the current mm. Any other access warns and returns NULL.
 */
struct page *find_timens_vvar_page(struct vm_area_struct *vma)
{
	if (likely(vma->vm_mm == current->mm))
		return current->nsproxy->time_ns->vvar_page;

	/*
	 * VM_PFNMAP | VM_IO protect .fault() handler from being called
	 * through interfaces like /proc/$pid/mem or
	 * process_vm_{readv,writev}() as long as there's no .access()
	 * in special_mapping_vmops().
	 * For more details check_vma_flags() and __access_remote_vm()
	 */

	WARN(1, "vvar_page accessed remotely");

	return NULL;
}

/*
 * Protects possibly multiple offsets writers racing each other
 * and tasks entering the namespace.
 */
static DEFINE_MUTEX(offset_lock);

/*
 * Initialise @ns's VVAR page once and freeze the namespace offsets.
 *
 * Nothing is done for init_time_ns or for a namespace whose offsets are
 * already frozen. Otherwise, under offset_lock, frozen_offsets is set and
 * every clocksource base (CS_BASES entries) of the vdso data is filled in.
 * @task is not used by the body.
 */
static void timens_set_vvar_page(struct task_struct *task,
				struct time_namespace *ns)
{
	struct vdso_data *vdata;
	unsigned int i;

	if (ns == &init_time_ns)
		return;

	/* Fast-path, taken by every task in namespace except the first. */
	if (likely(ns->frozen_offsets))
		return;

	mutex_lock(&offset_lock);
	/* Nothing to-do: vvar_page has been already initialized. */
	/* Re-checked under the lock, since the test above was unlocked. */
	if (ns->frozen_offsets)
		goto out;

	ns->frozen_offsets = true;
	vdata = arch_get_vdso_data(page_address(ns->vvar_page));

	for (i = 0; i < CS_BASES; i++)
		timens_setup_vdso_data(&vdata[i], ns);

out:
	mutex_unlock(&offset_lock);
}

/*
 * Release everything a time namespace owns: the ucount charge, the user
 * namespace reference, the inode number, the VVAR page and the structure.
 */
void free_time_ns(struct time_namespace *ns)
{
	dec_time_namespaces(ns->ucounts);
	put_user_ns(ns->user_ns);
	ns_free_inum(&ns->ns);
	__free_page(ns->vvar_page);
	kfree(ns);
}

/* Map an embedded ns_common back to its containing time_namespace. */
static struct time_namespace *to_time_ns(struct ns_common *ns)
{
	return container_of(ns, struct time_namespace, ns);
}

/*
 * Take a reference on @task's time namespace.
 *
 * Return: the namespace's ns_common, or NULL if the task has no nsproxy.
 */
static struct ns_common *timens_get(struct task_struct *task)
{
	struct time_namespace *ns = NULL;
	struct nsproxy *nsproxy;

	/* nsproxy is read and the reference taken under task_lock(). */
	task_lock(task);
	nsproxy = task->nsproxy;
	if (nsproxy) {
		ns = nsproxy->time_ns;
		get_time_ns(ns);
	}
	task_unlock(task);

	return ns ? &ns->ns : NULL;
}

/*
 * Same as timens_get(), but for the task's time_ns_for_children.
 */
static struct ns_common *timens_for_children_get(struct task_struct *task)
{
	struct time_namespace *ns = NULL;
	struct nsproxy *nsproxy;

	task_lock(task);
	nsproxy = task->nsproxy;
	if (nsproxy) {
		ns = nsproxy->time_ns_for_children;
		get_time_ns(ns);
	}
	task_unlock(task);

	return ns ? &ns->ns : NULL;
}

/* Drop a reference obtained through one of the get callbacks. */
static void timens_put(struct ns_common *ns)
{
	put_time_ns(to_time_ns(ns));
}

/* Set up @ns's VVAR page if needed, then join @tsk's vdso to the namespace. */
void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
{
	timens_set_vvar_page(tsk, ns);
	vdso_join_timens(tsk, ns);
}

/*
 * Install @new as both time_ns and time_ns_for_children of @nsset->nsproxy.
 *
 * Return: 0 on success, -EUSERS if the caller is multi-threaded, -EPERM if
 * CAP_SYS_ADMIN is missing in either the target namespace's user namespace
 * or the user namespace of @nsset's credentials.
 */
static int timens_install(struct nsset *nsset, struct ns_common *new)
{
	struct nsproxy *nsproxy = nsset->nsproxy;
	struct time_namespace *ns = to_time_ns(new);

	if (!current_is_single_threaded())
		return -EUSERS;

	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
	    !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	/* One reference per nsproxy slot; each old namespace is released. */
	get_time_ns(ns);
	put_time_ns(nsproxy->time_ns);
	nsproxy->time_ns = ns;

	get_time_ns(ns);
	put_time_ns(nsproxy->time_ns_for_children);
	nsproxy->time_ns_for_children = ns;
	return 0;
}

/*
 * On fork, switch @nsproxy->time_ns to time_ns_for_children when the two
 * differ, and commit the new namespace for @tsk.
 */
void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
{
	struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
	struct time_namespace *ns = to_time_ns(nsc);

	/* create_new_namespaces() already incremented the ref counter */
	if (nsproxy->time_ns == nsproxy->time_ns_for_children)
		return;

	get_time_ns(ns);
	put_time_ns(nsproxy->time_ns);
	nsproxy->time_ns = ns;

	timens_commit(tsk, ns);
}

/* Return the user namespace that owns the given time namespace. */
static struct user_namespace *timens_owner(struct ns_common *ns)
{
	return to_time_ns(ns)->user_ns;
}

/*
 * Print one "<clock name> <sec> <nsec>" line to @m. Clock ids other than
 * CLOCK_BOOTTIME and CLOCK_MONOTONIC are printed as "unknown".
 */
static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts)
{
	char *clock;

	switch (clockid) {
	case CLOCK_BOOTTIME:
		clock = "boottime";
		break;
	case CLOCK_MONOTONIC:
		clock = "monotonic";
		break;
	default:
		clock = "unknown";
		break;
	}
	seq_printf(m, "%-10s %10lld %9ld\n", clock, ts->tv_sec, ts->tv_nsec);
}

/*
 * Print the monotonic and boottime offsets of @p's time_ns_for_children.
 * Prints nothing when the task has no nsproxy.
 */
void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m)
{
	struct ns_common *ns;
	struct time_namespace *time_ns;

	ns = timens_for_children_get(p);
	if (!ns)
		return;
	time_ns = to_time_ns(ns);

	show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic);
	show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime);
	put_time_ns(time_ns);
}

/*
 * Set clock offsets of @p's time_ns_for_children.
 *
 * @file:     file the request came through; used for the CAP_SYS_TIME check
 * @p:        task whose time_ns_for_children is modified
 * @offsets:  array of (clockid, value) pairs
 * @noffsets: number of entries in @offsets
 *
 * All entries are validated before anything is written; the write itself
 * happens under offset_lock and only while the offsets are not yet frozen.
 *
 * Return: 0 on success, -ESRCH if the task has no nsproxy, -EPERM without
 * CAP_SYS_TIME, -EINVAL for an unsupported clock id, -ERANGE for an
 * out-of-range value, -EACCES if the offsets are already frozen.
 */
int proc_timens_set_offset(struct file *file, struct task_struct *p,
			   struct proc_timens_offset *offsets, int noffsets)
{
	struct ns_common *ns;
	struct time_namespace *time_ns;
	struct timespec64 tp;
	int i, err;

	ns = timens_for_children_get(p);
	if (!ns)
		return -ESRCH;
	time_ns = to_time_ns(ns);

	if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) {
		put_time_ns(time_ns);
		return -EPERM;
	}

	/* Pass 1: validate every entry without modifying the namespace. */
	for (i = 0; i < noffsets; i++) {
		struct proc_timens_offset *off = &offsets[i];

		/* Sample the current host value of the clock being offset. */
		switch (off->clockid) {
		case CLOCK_MONOTONIC:
			ktime_get_ts64(&tp);
			break;
		case CLOCK_BOOTTIME:
			ktime_get_boottime_ts64(&tp);
			break;
		default:
			err = -EINVAL;
			goto out;
		}

		err = -ERANGE;

		if (off->val.tv_sec > KTIME_SEC_MAX ||
		    off->val.tv_sec < -KTIME_SEC_MAX)
			goto out;

		tp = timespec64_add(tp, off->val);
		/*
		 * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is
		 * still unreachable.
		 */
		if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2)
			goto out;
	}

	mutex_lock(&offset_lock);
	if (time_ns->frozen_offsets) {
		err = -EACCES;
		goto out_unlock;
	}

	err = 0;
	/* Don't report errors after this line */
	/* Pass 2: apply. Clock ids were already restricted by pass 1. */
	for (i = 0; i < noffsets; i++) {
		struct proc_timens_offset *off = &offsets[i];
		struct timespec64 *offset = NULL;

		switch (off->clockid) {
		case CLOCK_MONOTONIC:
			offset = &time_ns->offsets.monotonic;
			break;
		case CLOCK_BOOTTIME:
			offset = &time_ns->offsets.boottime;
			break;
		}

		*offset = off->val;
	}

out_unlock:
	mutex_unlock(&offset_lock);
out:
	put_time_ns(time_ns);

	return err;
}

/* Namespace operations for the "time" entry. */
const struct proc_ns_operations timens_operations = {
	.name		= "time",
	.type		= CLONE_NEWTIME,
	.get		= timens_get,
	.put		= timens_put,
	.install	= timens_install,
	.owner		= timens_owner,
};

/*
 * Namespace operations for the "time_for_children" entry; differs from
 * timens_operations in its name, real_ns_name and get callback.
 */
const struct proc_ns_operations timens_for_children_operations = {
	.name		= "time_for_children",
	.real_ns_name	= "time",
	.type		= CLONE_NEWTIME,
	.get		= timens_for_children_get,
	.put		= timens_put,
	.install	= timens_install,
	.owner		= timens_owner,
};

/* The initial time namespace; its offsets are frozen from the start. */
struct time_namespace init_time_ns = {
	.ns.count	= REFCOUNT_INIT(3),
	.user_ns	= &init_user_ns,
	.ns.inum	= PROC_TIME_INIT_INO,
	.ns.ops		= &timens_operations,
	.frozen_offsets	= true,
};
|
| 21 4349 21 21 21 21 21 21 21 21 21 4390 4391 4397 4389 4346 4386 4267 4269 4276 15 4267 26 7 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 
503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 | // SPDX-License-Identifier: GPL-2.0 /* * Workingset detection * * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner */ #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/writeback.h> #include <linux/shmem_fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/module.h> #include <linux/swap.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> #include "internal.h" /* * Double CLOCK lists * * Per node, two clock lists are maintained for file pages: the * inactive and the active list. 
Freshly faulted pages start out at * the head of the inactive list and page reclaim scans pages from the * tail. Pages that are accessed multiple times on the inactive list * are promoted to the active list, to protect them from reclaim, * whereas active pages are demoted to the inactive list when the * active list grows too big. * * fault ------------------------+ * | * +--------------+ | +-------------+ * reclaim <- | inactive | <-+-- demotion | active | <--+ * +--------------+ +-------------+ | * | | * +-------------- promotion ------------------+ * * * Access frequency and refault distance * * A workload is thrashing when its pages are frequently used but they * are evicted from the inactive list every time before another access * would have promoted them to the active list. * * In cases where the average access distance between thrashing pages * is bigger than the size of memory there is nothing that can be * done - the thrashing set could never fit into memory under any * circumstance. * * However, the average access distance could be bigger than the * inactive list, yet smaller than the size of memory. In this case, * the set could fit into memory if it weren't for the currently * active pages - which may be used more, hopefully less frequently: * * +-memory available to cache-+ * | | * +-inactive------+-active----+ * a b | c d e f g h i | J K L M N | * +---------------+-----------+ * * It is prohibitively expensive to accurately track access frequency * of pages. But a reasonable approximation can be made to measure * thrashing on the inactive list, after which refaulting pages can be * activated optimistically to compete with the existing active pages. * * Approximating inactive page access frequency - Observations: * * 1. When a page is accessed for the first time, it is added to the * head of the inactive list, slides every existing inactive page * towards the tail by one slot, and pushes the current tail page * out of memory. * * 2. 
When a page is accessed for the second time, it is promoted to * the active list, shrinking the inactive list by one slot. This * also slides all inactive pages that were faulted into the cache * more recently than the activated page towards the tail of the * inactive list. * * Thus: * * 1. The sum of evictions and activations between any two points in * time indicate the minimum number of inactive pages accessed in * between. * * 2. Moving one inactive page N page slots towards the tail of the * list requires at least N inactive page accesses. * * Combining these: * * 1. When a page is finally evicted from memory, the number of * inactive pages accessed while the page was in cache is at least * the number of page slots on the inactive list. * * 2. In addition, measuring the sum of evictions and activations (E) * at the time of a page's eviction, and comparing it to another * reading (R) at the time the page faults back into memory tells * the minimum number of accesses while the page was not cached. * This is called the refault distance. 
* * Because the first access of the page was the fault and the second * access the refault, we combine the in-cache distance with the * out-of-cache distance to get the complete minimum access distance * of this page: * * NR_inactive + (R - E) * * And knowing the minimum access distance of a page, we can easily * tell if the page would be able to stay in cache assuming all page * slots in the cache were available: * * NR_inactive + (R - E) <= NR_inactive + NR_active * * If we have swap we should also consider NR_inactive_anon and * NR_active_anon, so for page cache and anonymous pages respectively: * * NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file * + NR_inactive_anon + NR_active_anon * * NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon * + NR_inactive_file + NR_active_file * * Which can be further simplified to: * * (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon * * (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file * * Put into words, the refault distance (out-of-cache) can be seen as * a deficit in inactive list space (in-cache). If the inactive list * had (R - E) more page slots, the page would not have been evicted * in between accesses, but activated instead. And on a full system, * the only thing eating into inactive list space is active pages. * * * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given * time there is actually a good chance that pages on the active list * are no longer in active use. * * So when a refault distance of (R - E) is observed and there are at * least (R - E) pages in the userspace workingset, the refaulting page * is activated optimistically in the hope that (R - E) pages are actually * used less frequently than the refaulting page - or even not used at * all anymore.
*
 * That means if inactive cache is refaulting with a suitable refault
 * distance, we assume the cache workingset is transitioning and put
 * pressure on the current workingset.
 *
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
 *
 * But if this is right, the stale pages will be pushed out of memory
 * and the used pages get to stay in cache.
 *
 *		Refaulting active pages
 *
 * If on the other hand the refaulting pages have recently been
 * deactivated, it means that the active list is no longer protecting
 * actively used cache from reclaim. The cache is NOT transitioning to
 * a different workingset; the existing workingset is thrashing in the
 * space allocated to the page cache.
 *
 *
 *		Implementation
 *
 * For each node's LRU lists, a counter for inactive evictions and
 * activations is maintained (node->nonresident_age).
 *
 * On eviction, a snapshot of this counter (along with some bits to
 * identify the node) is stored in the now empty page cache
 * slot of the evicted page.  This is called a shadow entry.
 *
 * On cache misses for which there are shadow entries, an eligible
 * refault distance will immediately activate the refaulting page.
 */

/* Width, in bits, of the workingset flag kept in the low end of a shadow entry. */
#define WORKINGSET_SHIFT 1
/*
 * Number of bits of an unsigned long that are NOT available to the eviction
 * counter: the bits an xarray value cannot hold, plus the workingset flag,
 * the node id and the memcg id.
 */
#define EVICTION_SHIFT	((BITS_PER_LONG - BITS_PER_XA_VALUE) +	\
			 WORKINGSET_SHIFT + NODES_SHIFT + \
			 MEM_CGROUP_ID_SHIFT)
/* Mask selecting the eviction counter bits that fit in a shadow entry. */
#define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)

/*
 * Eviction timestamps need to be able to cover the full range of
 * actionable refaults. However, bits are tight in the xarray
 * entry, and after storing the identifier for the lruvec there might
 * not be enough left to represent every single actionable refault. In
 * that case, we have to sacrifice granularity for distance, and group
 * evictions into coarser buckets by shaving off lower timestamp bits.
*/
static unsigned int bucket_order __read_mostly;

/*
 * Build a shadow entry.
 *
 * @eviction is masked with EVICTION_MASK and then, from high to low bits,
 * followed by @memcgid, the node id of @pgdat and the @workingset flag. The
 * result is wrapped as an xarray value.
 */
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
			 bool workingset)
{
	eviction &= EVICTION_MASK;
	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
	eviction = (eviction << WORKINGSET_SHIFT) | workingset;

	return xa_mk_value(eviction);
}

/*
 * Inverse of pack_shadow(): peel the fields off @shadow from the low bits
 * upwards (workingset flag, node id, memcg id) and return what remains as
 * the eviction value. The node id is turned back into a pgdat via NODE_DATA().
 */
static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
			  unsigned long *evictionp, bool *workingsetp)
{
	unsigned long entry = xa_to_value(shadow);
	int memcgid, nid;
	bool workingset;

	workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
	entry >>= WORKINGSET_SHIFT;
	nid = entry & ((1UL << NODES_SHIFT) - 1);
	entry >>= NODES_SHIFT;
	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
	entry >>= MEM_CGROUP_ID_SHIFT;

	*memcgidp = memcgid;
	*pgdat = NODE_DATA(nid);
	*evictionp = entry;
	*workingsetp = workingset;
}

#ifdef CONFIG_LRU_GEN

/*
 * Multi-gen LRU flavour of eviction bookkeeping.
 *
 * Builds a token from the lruvec's current min_seq for the folio's type and
 * the folio's reference count (refs - 1, floored at 0), accounts the folio's
 * pages in lrugen->evicted[hist][type][tier] and returns the packed shadow
 * entry. @refs is passed as the workingset argument of pack_shadow().
 */
static void *lru_gen_eviction(struct folio *folio)
{
	int hist;
	unsigned long token;
	unsigned long min_seq;
	struct lruvec *lruvec;
	struct lru_gen_folio *lrugen;
	int type = folio_is_file_lru(folio);
	int delta = folio_nr_pages(folio);
	int refs = folio_lru_refs(folio);
	int tier = lru_tier_from_refs(refs);
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = folio_pgdat(folio);

	/* The token (sequence + refs) must fit in the eviction bits. */
	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	lrugen = &lruvec->lrugen;
	min_seq = READ_ONCE(lrugen->min_seq[type]);
	token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);

	hist = lru_hist_from_seq(min_seq);
	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);

	return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
}

/*
 * Tests if the shadow entry is for a folio that was recently evicted.
 * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
 *
 * "Recent" here means the sequence number stored in the token equals the
 * lruvec's current min_seq for @file, truncated to the bits a token can hold.
 */
static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
				unsigned long *token, bool *workingset)
{
	int memcg_id;
	unsigned long min_seq;
	struct mem_cgroup *memcg;
	struct pglist_data *pgdat;

	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);

	memcg = mem_cgroup_from_id(memcg_id);
	*lruvec = mem_cgroup_lruvec(memcg, pgdat);

	min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
	return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
}

/*
 * Multi-gen LRU flavour of refault handling.
 *
 * Runs under rcu_read_lock(). Nothing is accounted unless the shadow's
 * lruvec is the folio's current lruvec. A refault is always counted;
 * activation (and possibly restore) is only counted for recent evictions.
 */
static void lru_gen_refault(struct folio *folio, void *shadow)
{
	bool recent;
	int hist, tier, refs;
	bool workingset;
	unsigned long token;
	struct lruvec *lruvec;
	struct lru_gen_folio *lrugen;
	int type = folio_is_file_lru(folio);
	int delta = folio_nr_pages(folio);

	rcu_read_lock();

	recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset);
	if (lruvec != folio_lruvec(folio))
		goto unlock;

	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);

	if (!recent)
		goto unlock;

	lrugen = &lruvec->lrugen;

	hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
	/* see the comment in folio_lru_refs() */
	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
	tier = lru_tier_from_refs(refs);

	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
	mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);

	/*
	 * Count the following two cases as stalls:
	 * 1. For pages accessed through page tables, hotter pages pushed out
	 *    hot pages which refaulted immediately.
	 * 2. For pages accessed multiple times through file descriptors,
	 *    they would have been protected by sort_folio().
	 */
	if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) {
		set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset));
		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
	}
unlock:
	rcu_read_unlock();
}

#else /* !CONFIG_LRU_GEN */

/* Stub used when CONFIG_LRU_GEN is off: no shadow entry is produced. */
static void *lru_gen_eviction(struct folio *folio)
{
	return NULL;
}

/* Stub used when CONFIG_LRU_GEN is off: never recent, outputs untouched. */
static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
				unsigned long *token, bool *workingset)
{
	return false;
}

/* Stub used when CONFIG_LRU_GEN is off: nothing to account. */
static void lru_gen_refault(struct folio *folio, void *shadow)
{
}

#endif /* CONFIG_LRU_GEN */

/**
 * workingset_age_nonresident - age non-resident entries as LRU ages
 * @lruvec: the lruvec that was aged
 * @nr_pages: the number of pages to count
 *
 * As in-memory pages are aged, non-resident pages need to be aged as
 * well, in order for the refault distances later on to be comparable
 * to the in-memory dimensions. This function allows reclaim and LRU
 * operations to drive the non-resident aging along in parallel.
 */
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
{
	/*
	 * Reclaiming a cgroup means reclaiming all its children in a
	 * round-robin fashion. That means that each cgroup has an LRU
	 * order that is composed of the LRU orders of its child
	 * cgroups; and every page has an LRU position not just in the
	 * cgroup that owns it, but in all of that group's ancestors.
	 *
	 * So when the physical inactive list of a leaf cgroup ages,
	 * the virtual inactive lists of all its parents, including
	 * the root cgroup's, age as well.
	 */
	/* Walk from @lruvec up through parent_lruvec() until it returns NULL. */
	do {
		atomic_long_add(nr_pages, &lruvec->nonresident_age);
	} while ((lruvec = parent_lruvec(lruvec)));
}

/**
 * workingset_eviction - note the eviction of a folio from memory
 * @target_memcg: the cgroup that is causing the reclaim
 * @folio: the folio being evicted
 *
 * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
 * of the evicted @folio so that a later refault can be detected.
*/
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
{
	struct pglist_data *pgdat = folio_pgdat(folio);
	unsigned long eviction;
	struct lruvec *lruvec;
	int memcgid;

	/* Folio is fully exclusive and pins folio's memory cgroup pointer */
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	/* The multi-gen LRU builds its own shadow entry format */
	if (lru_gen_enabled())
		return lru_gen_eviction(folio);

	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
	/* XXX: target_memcg can be NULL, go through lruvec */
	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
	/* Snapshot the age before this eviction advances it */
	eviction = atomic_long_read(&lruvec->nonresident_age);
	/* Drop the low bits; workingset_test_recent() shifts them back in */
	eviction >>= bucket_order;
	workingset_age_nonresident(lruvec, folio_nr_pages(folio));
	return pack_shadow(memcgid, pgdat, eviction,
				folio_test_workingset(folio));
}

/**
 * workingset_test_recent - tests if the shadow entry is for a folio that was
 * recently evicted. Also fills in @workingset with the value unpacked from
 * shadow.
 * @shadow: the shadow entry to be tested.
 * @file: whether the corresponding folio is from the file lru.
 * @workingset: where the workingset value unpacked from shadow should
 * be stored.
 * @flush: whether to flush cgroup rstat.
 *
 * Return: true if the shadow is for a recently evicted folio; false otherwise.
 */
bool workingset_test_recent(void *shadow, bool file, bool *workingset,
				bool flush)
{
	struct mem_cgroup *eviction_memcg;
	struct lruvec *eviction_lruvec;
	unsigned long refault_distance;
	unsigned long workingset_size;
	unsigned long refault;
	int memcgid;
	struct pglist_data *pgdat;
	unsigned long eviction;

	rcu_read_lock();

	if (lru_gen_enabled()) {
		bool recent = lru_gen_test_recent(shadow, file,
				&eviction_lruvec, &eviction, workingset);

		rcu_read_unlock();
		return recent;
	}

	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
	/* Undo the bucket_order shift applied in workingset_eviction() */
	eviction <<= bucket_order;

	/*
	 * Look up the memcg associated with the stored ID. It might
	 * have been deleted since the folio's eviction.
	 *
	 * Note that in rare events the ID could have been recycled
	 * for a new cgroup that refaults a shared folio. This is
	 * impossible to tell from the available data. However, this
	 * should be a rare and limited disturbance, and activations
	 * are always speculative anyway. Ultimately, it's the aging
	 * algorithm's job to shake out the minimum access frequency
	 * for the active cache.
	 *
	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
	 * would be better if the root_mem_cgroup existed in all
	 * configurations instead.
	 */
	eviction_memcg = mem_cgroup_from_id(memcgid);
	/* On success the tryget reference is dropped by mem_cgroup_put() below */
	if (!mem_cgroup_disabled() &&
	    (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) {
		rcu_read_unlock();
		return false;
	}

	rcu_read_unlock();

	/*
	 * Flush stats (and potentially sleep) outside the RCU read section.
	 *
	 * Note that workingset_test_recent() itself might be called in RCU read
	 * section (for e.g, in cachestat) - these callers need to skip flushing
	 * stats (via the flush argument).
	 *
	 * XXX: With per-memcg flushing and thresholding, is ratelimiting
	 * still needed here?
	 */
	if (flush)
		mem_cgroup_flush_stats_ratelimited(eviction_memcg);

	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
	refault = atomic_long_read(&eviction_lruvec->nonresident_age);

	/*
	 * Calculate the refault distance
	 *
	 * The unsigned subtraction here gives an accurate distance
	 * across nonresident_age overflows in most cases. There is a
	 * special case: usually, shadow entries have a short lifetime
	 * and are either refaulted or reclaimed along with the inode
	 * before they get too old. But it is not impossible for the
	 * nonresident_age to lap a shadow entry in the field, which
	 * can then result in a false small refault distance, leading
	 * to a false activation should this old entry actually
	 * refault again. However, earlier kernels used to deactivate
	 * unconditionally with *every* reclaim invocation for the
	 * longest time, so the occasional inappropriate activation
	 * leading to pressure on the active list is not a problem.
	 */
	refault_distance = (refault - eviction) & EVICTION_MASK;

	/*
	 * Compare the distance to the existing workingset size. We
	 * don't activate pages that couldn't stay resident even if
	 * all the memory was available to the workingset. Whether
	 * workingset competition needs to consider anon or not depends
	 * on having free swap space.
	 */
	workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
	if (!file) {
		workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_INACTIVE_FILE);
	}
	if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
		workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_ACTIVE_ANON);
		if (file) {
			workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_INACTIVE_ANON);
		}
	}

	mem_cgroup_put(eviction_memcg);
	return refault_distance <= workingset_size;
}

/**
 * workingset_refault - Evaluate the refault of a previously evicted folio.
 * @folio: The freshly allocated replacement folio.
 * @shadow: Shadow entry of the evicted folio.
 *
 * Calculates and evaluates the refault distance of the previously
 * evicted folio in the context of the node and the memcg whose memory
 * pressure caused the eviction.
 */
void workingset_refault(struct folio *folio, void *shadow)
{
	bool file = folio_is_file_lru(folio);
	struct pglist_data *pgdat;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;
	bool workingset;
	long nr;

	if (lru_gen_enabled()) {
		lru_gen_refault(folio, shadow);
		return;
	}

	/*
	 * The activation decision for this folio is made at the level
	 * where the eviction occurred, as that is where the LRU order
	 * during folio reclaim is being determined.
	 *
	 * However, the cgroup that will own the folio is the one that
	 * is actually experiencing the refault event. Make sure the folio is
	 * locked to guarantee folio_memcg() stability throughout.
	 */
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	nr = folio_nr_pages(folio);
	memcg = folio_memcg(folio);
	pgdat = folio_pgdat(folio);
	lruvec = mem_cgroup_lruvec(memcg, pgdat);

	/* The refault itself is always counted */
	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);

	if (!workingset_test_recent(shadow, file, &workingset, true))
		return;

	folio_set_active(folio);
	workingset_age_nonresident(lruvec, nr);
	mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);

	/* Folio was active prior to eviction */
	if (workingset) {
		folio_set_workingset(folio);
		/*
		 * XXX: Move to folio_add_lru() when it supports new vs
		 * putback
		 */
		lru_note_cost_refault(folio);
		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
	}
}

/**
 * workingset_activation - note a page activation
 * @folio: Folio that is being activated.
 */
void workingset_activation(struct folio *folio)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	/*
	 * Filter non-memcg pages here, e.g. unmap can call
	 * mark_page_accessed() on VDSO pages.
	 *
	 * XXX: See workingset_refault() - this should return
	 * root_mem_cgroup even for !CONFIG_MEMCG.
	 */
	memcg = folio_memcg_rcu(folio);
	if (!mem_cgroup_disabled() && !memcg)
		goto out;
	workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
out:
	rcu_read_unlock();
}

/*
 * Shadow entries reflect the share of the working set that does not
 * fit into memory, so their number depends on the access pattern of
 * the workload.  In most cases, they will refault or get reclaimed
 * along with the inode, but a (malicious) workload that streams
 * through files with a total size several times that of available
 * memory, while preventing the inodes from being reclaimed, can
 * create excessive amounts of shadow nodes.  To keep a lid on this,
 * track shadow nodes and reclaim them when they grow way past the
 * point where they would still be useful.
*/ struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { struct address_space *mapping; struct page *page = virt_to_page(node); /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. * * Avoid acquiring the list_lru lock when the nodes are * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ mapping = container_of(node->array, struct address_space, i_pages); lockdep_assert_held(&mapping->i_pages.xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add_obj(&shadow_nodes, &node->private_list); __inc_node_page_state(page, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del_obj(&shadow_nodes, &node->private_list); __dec_node_page_state(page, WORKINGSET_NODES); } } } static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { unsigned long max_nodes; unsigned long nodes; unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); if (!nodes) return SHRINK_EMPTY; /* * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. * * The size of the active list converges toward 100% of * overall page cache as memory grows, with only a tiny * inactive list. Assume the total cache size for that. * * Nodes might be sparsely populated, with only one shadow * entry in the extreme case. Obviously, we cannot keep one * node for every eligible shadow entry, so compromise on a * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. 
* * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ #ifdef CONFIG_MEMCG if (sc->memcg) { struct lruvec *lruvec; int i; mem_cgroup_flush_stats_ratelimited(sc->memcg); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, NR_LRU_BASE + i); pages += lruvec_page_state_local( lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; pages += lruvec_page_state_local( lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; } else #endif pages = node_present_pages(sc->nid); max_nodes = pages >> (XA_CHUNK_SHIFT - 3); if (nodes <= max_nodes) return 0; return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) __must_hold(lru_lock) { struct xa_node *node = container_of(item, struct xa_node, private_list); struct address_space *mapping; int ret; /* * Page cache insertions and deletions synchronously maintain * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. 
*/ mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { spin_unlock_irq(lru_lock); ret = LRU_RETRY; goto out; } /* For page cache we need to hold i_lock */ if (mapping->host != NULL) { if (!spin_trylock(&mapping->host->i_lock)) { xa_unlock(&mapping->i_pages); spin_unlock_irq(lru_lock); ret = LRU_RETRY; goto out; } } list_lru_isolate(lru, item); __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES); spin_unlock(lru_lock); /* * The nodes should only contain one or more shadow entries, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; xa_delete_node(node, workingset_update_node); __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); if (mapping->host != NULL) { if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); } ret = LRU_REMOVED_RETRY; out: cond_resched(); spin_lock_irq(lru_lock); return ret; } static unsigned long scan_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { /* list_lru lock nests inside the IRQ-safe i_pages lock */ return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, NULL); } /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe * i_pages lock. */ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { struct shrinker *workingset_shadow_shrinker; unsigned int timestamp_bits; unsigned int max_order; int ret = -ENOMEM; BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); /* * Calculate the eviction bucket size to cover the longest * actionable refault distance, which is currently half of * memory (totalram_pages/2). 
However, memory hotplug may add * some more pages at runtime, so keep working with up to * double the initial memory by using totalram_pages as-is. */ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; max_order = fls_long(totalram_pages() - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-shadow"); if (!workingset_shadow_shrinker) goto err; ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, workingset_shadow_shrinker); if (ret) goto err_list_lru; workingset_shadow_shrinker->count_objects = count_shadow_nodes; workingset_shadow_shrinker->scan_objects = scan_shadow_nodes; /* ->count reports only fully expendable nodes */ workingset_shadow_shrinker->seeks = 0; shrinker_register(workingset_shadow_shrinker); return 0; err_list_lru: shrinker_free(workingset_shadow_shrinker); err: return ret; } module_init(workingset_init); |
| 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 | // SPDX-License-Identifier: GPL-2.0-only /* * scsi_pm.c Copyright (C) 2010 Alan Stern * * SCSI dynamic Power Management * Initial version: Alan Stern <stern@rowland.harvard.edu> */ #include <linux/pm_runtime.h> #include <linux/export.h> #include <linux/blk-pm.h> #include <scsi/scsi.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_host.h> #include "scsi_priv.h" #ifdef CONFIG_PM_SLEEP static int do_scsi_suspend(struct device *dev, const struct dev_pm_ops *pm) { return pm && pm->suspend ? pm->suspend(dev) : 0; } static int do_scsi_freeze(struct device *dev, const struct dev_pm_ops *pm) { return pm && pm->freeze ? pm->freeze(dev) : 0; } static int do_scsi_poweroff(struct device *dev, const struct dev_pm_ops *pm) { return pm && pm->poweroff ? pm->poweroff(dev) : 0; } static int do_scsi_resume(struct device *dev, const struct dev_pm_ops *pm) { return pm && pm->resume ? 
pm->resume(dev) : 0; } static int do_scsi_thaw(struct device *dev, const struct dev_pm_ops *pm) { return pm && pm->thaw ? pm->thaw(dev) : 0; } static int do_scsi_restore(struct device *dev, const struct dev_pm_ops *pm) { return pm && pm->restore ? pm->restore(dev) : 0; } static int scsi_dev_type_suspend(struct device *dev, int (*cb)(struct device *, const struct dev_pm_ops *)) { const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int err; err = scsi_device_quiesce(to_scsi_device(dev)); if (err == 0) { err = cb(dev, pm); if (err) scsi_device_resume(to_scsi_device(dev)); } dev_dbg(dev, "scsi suspend: %d\n", err); return err; } static int scsi_bus_suspend_common(struct device *dev, int (*cb)(struct device *, const struct dev_pm_ops *)) { if (!scsi_is_sdev_device(dev)) return 0; return scsi_dev_type_suspend(dev, cb); } static int scsi_bus_resume_common(struct device *dev, int (*cb)(struct device *, const struct dev_pm_ops *)) { const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int err; if (!scsi_is_sdev_device(dev)) return 0; err = cb(dev, pm); scsi_device_resume(to_scsi_device(dev)); dev_dbg(dev, "scsi resume: %d\n", err); return err; } static int scsi_bus_prepare(struct device *dev) { if (scsi_is_host_device(dev)) { /* Wait until async scanning is finished */ scsi_complete_async_scans(); } return 0; } static int scsi_bus_suspend(struct device *dev) { return scsi_bus_suspend_common(dev, do_scsi_suspend); } static int scsi_bus_resume(struct device *dev) { return scsi_bus_resume_common(dev, do_scsi_resume); } static int scsi_bus_freeze(struct device *dev) { return scsi_bus_suspend_common(dev, do_scsi_freeze); } static int scsi_bus_thaw(struct device *dev) { return scsi_bus_resume_common(dev, do_scsi_thaw); } static int scsi_bus_poweroff(struct device *dev) { return scsi_bus_suspend_common(dev, do_scsi_poweroff); } static int scsi_bus_restore(struct device *dev) { return scsi_bus_resume_common(dev, do_scsi_restore); } #else /* 
CONFIG_PM_SLEEP */ #define scsi_bus_prepare NULL #define scsi_bus_suspend NULL #define scsi_bus_resume NULL #define scsi_bus_freeze NULL #define scsi_bus_thaw NULL #define scsi_bus_poweroff NULL #define scsi_bus_restore NULL #endif /* CONFIG_PM_SLEEP */ static int sdev_runtime_suspend(struct device *dev) { const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; struct scsi_device *sdev = to_scsi_device(dev); int err = 0; err = blk_pre_runtime_suspend(sdev->request_queue); if (err) return err; if (pm && pm->runtime_suspend) err = pm->runtime_suspend(dev); blk_post_runtime_suspend(sdev->request_queue, err); return err; } static int scsi_runtime_suspend(struct device *dev) { int err = 0; dev_dbg(dev, "scsi_runtime_suspend\n"); if (scsi_is_sdev_device(dev)) err = sdev_runtime_suspend(dev); /* Insert hooks here for targets, hosts, and transport classes */ return err; } static int sdev_runtime_resume(struct device *dev) { struct scsi_device *sdev = to_scsi_device(dev); const struct dev_pm_ops *pm = dev->driver ? 
dev->driver->pm : NULL; int err = 0; blk_pre_runtime_resume(sdev->request_queue); if (pm && pm->runtime_resume) err = pm->runtime_resume(dev); blk_post_runtime_resume(sdev->request_queue); return err; } static int scsi_runtime_resume(struct device *dev) { int err = 0; dev_dbg(dev, "scsi_runtime_resume\n"); if (scsi_is_sdev_device(dev)) err = sdev_runtime_resume(dev); /* Insert hooks here for targets, hosts, and transport classes */ return err; } static int scsi_runtime_idle(struct device *dev) { dev_dbg(dev, "scsi_runtime_idle\n"); /* Insert hooks here for targets, hosts, and transport classes */ if (scsi_is_sdev_device(dev)) { pm_runtime_mark_last_busy(dev); pm_runtime_autosuspend(dev); return -EBUSY; } return 0; } int scsi_autopm_get_device(struct scsi_device *sdev) { int err; err = pm_runtime_get_sync(&sdev->sdev_gendev); if (err < 0 && err !=-EACCES) pm_runtime_put_sync(&sdev->sdev_gendev); else err = 0; return err; } EXPORT_SYMBOL_GPL(scsi_autopm_get_device); void scsi_autopm_put_device(struct scsi_device *sdev) { pm_runtime_put_sync(&sdev->sdev_gendev); } EXPORT_SYMBOL_GPL(scsi_autopm_put_device); void scsi_autopm_get_target(struct scsi_target *starget) { pm_runtime_get_sync(&starget->dev); } void scsi_autopm_put_target(struct scsi_target *starget) { pm_runtime_put_sync(&starget->dev); } int scsi_autopm_get_host(struct Scsi_Host *shost) { int err; err = pm_runtime_get_sync(&shost->shost_gendev); if (err < 0 && err !=-EACCES) pm_runtime_put_sync(&shost->shost_gendev); else err = 0; return err; } void scsi_autopm_put_host(struct Scsi_Host *shost) { pm_runtime_put_sync(&shost->shost_gendev); } const struct dev_pm_ops scsi_bus_pm_ops = { .prepare = scsi_bus_prepare, .suspend = scsi_bus_suspend, .resume = scsi_bus_resume, .freeze = scsi_bus_freeze, .thaw = scsi_bus_thaw, .poweroff = scsi_bus_poweroff, .restore = scsi_bus_restore, .runtime_suspend = scsi_runtime_suspend, .runtime_resume = scsi_runtime_resume, .runtime_idle = scsi_runtime_idle, }; |
| 333 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * fs/kernfs/kernfs-internal.h - kernfs internal header file * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de> */ #ifndef __KERNFS_INTERNAL_H #define __KERNFS_INTERNAL_H #include <linux/lockdep.h> #include <linux/fs.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/kernfs.h> #include <linux/fs_context.h> struct kernfs_iattrs { kuid_t ia_uid; kgid_t ia_gid; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; struct simple_xattrs xattrs; atomic_t nr_user_xattrs; atomic_t user_xattr_size; }; struct kernfs_root { /* published fields */ struct kernfs_node *kn; unsigned int flags; /* KERNFS_ROOT_* flags */ /* private fields, do not use outside kernfs proper */ struct idr ino_idr; u32 last_id_lowbits; u32 id_highbits; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_rwsem */ struct list_head supers; wait_queue_head_t deactivate_waitq; struct rw_semaphore kernfs_rwsem; struct rw_semaphore kernfs_iattr_rwsem; struct rw_semaphore kernfs_supers_rwsem; struct rcu_head rcu; }; /* +1 to avoid triggering overflow warning when negating it */ #define KN_DEACTIVATED_BIAS (INT_MIN + 1) /* KERNFS_TYPE_MASK and types are defined in 
include/linux/kernfs.h */ /** * kernfs_root - find out the kernfs_root a kernfs_node belongs to * @kn: kernfs_node of interest * * Return: the kernfs_root @kn belongs to. */ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) { /* if parent exists, it's always a dir; otherwise, @sd is a dir */ if (kn->parent) kn = kn->parent; return kn->dir.root; } /* * mount.c */ struct kernfs_super_info { struct super_block *sb; /* * The root associated with this super_block. Each super_block is * identified by the root and ns it's associated with. */ struct kernfs_root *root; /* * Each sb is associated with one namespace tag, currently the * network namespace of the task which mounted this kernfs * instance. If multiple tags become necessary, make the following * an array and compare kernfs_node tag against every entry. */ const void *ns; /* anchored at kernfs_root->supers, protected by kernfs_rwsem */ struct list_head node; }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) { if (d_really_is_negative(dentry)) return NULL; return d_inode(dentry)->i_private; } static inline void kernfs_set_rev(struct kernfs_node *parent, struct dentry *dentry) { dentry->d_time = parent->dir.rev; } static inline void kernfs_inc_rev(struct kernfs_node *parent) { parent->dir.rev++; } static inline bool kernfs_dir_changed(struct kernfs_node *parent, struct dentry *dentry) { if (parent->dir.rev != dentry->d_time) return true; return false; } extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; /* * inode.c */ extern const struct xattr_handler * const kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(struct 
mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c */ extern const struct dentry_operations kernfs_dops; extern const struct file_operations kernfs_dir_fops; extern const struct inode_operations kernfs_dir_iops; struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); void kernfs_put_active(struct kernfs_node *kn); int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags); /* * file.c */ extern const struct file_operations kernfs_file_fops; bool kernfs_should_drain_open_files(struct kernfs_node *kn); void kernfs_drain_open_files(struct kernfs_node *kn); /* * symlink.c */ extern const struct inode_operations kernfs_symlink_iops; /* * kernfs locks */ extern struct kernfs_global_locks *kernfs_locks; #endif /* __KERNFS_INTERNAL_H */ |
| 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" #include "xfs_error.h" #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_quota.h" /* * This is the number of entries in the l_buf_cancel_table used during * recovery. */ #define XLOG_BC_TABLE_SIZE 64 #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) /* * This structure is used during recovery to record the buf log items which * have been canceled and should not be replayed. */ struct xfs_buf_cancel { xfs_daddr_t bc_blkno; uint bc_len; int bc_refcount; struct list_head bc_list; }; static struct xfs_buf_cancel * xlog_find_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len) { struct list_head *bucket; struct xfs_buf_cancel *bcp; if (!log->l_buf_cancel_table) return NULL; bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); list_for_each_entry(bcp, bucket, bc_list) { if (bcp->bc_blkno == blkno && bcp->bc_len == len) return bcp; } return NULL; } static bool xlog_add_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len) { struct xfs_buf_cancel *bcp; /* * If we find an existing cancel record, this indicates that the buffer * was cancelled multiple times. 
To ensure that during pass 2 we keep * the record in the table until we reach its last occurrence in the * log, a reference count is kept to tell how many times we expect to * see this record during the second pass. */ bcp = xlog_find_buffer_cancelled(log, blkno, len); if (bcp) { bcp->bc_refcount++; return false; } bcp = kmalloc(sizeof(struct xfs_buf_cancel), GFP_KERNEL | __GFP_NOFAIL); bcp->bc_blkno = blkno; bcp->bc_len = len; bcp->bc_refcount = 1; list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno)); return true; } /* * Check if there is and entry for blkno, len in the buffer cancel record table. */ bool xlog_is_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len) { return xlog_find_buffer_cancelled(log, blkno, len) != NULL; } /* * Check if there is and entry for blkno, len in the buffer cancel record table, * and decremented the reference count on it if there is one. * * Remove the cancel record once the refcount hits zero, so that if the same * buffer is re-used again after its last cancellation we actually replay the * changes made at that point. */ static bool xlog_put_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len) { struct xfs_buf_cancel *bcp; bcp = xlog_find_buffer_cancelled(log, blkno, len); if (!bcp) { ASSERT(0); return false; } if (--bcp->bc_refcount == 0) { list_del(&bcp->bc_list); kfree(bcp); } return true; } /* log buffer item recovery */ /* * Sort buffer items for log recovery. Most buffer items should end up on the * buffer list and are recovered first, with the following exceptions: * * 1. XFS_BLF_CANCEL buffers must be processed last because some log items * might depend on the incor ecancellation record, and replaying a cancelled * buffer item can remove the incore record. * * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that * we replay di_next_unlinked only after flushing the inode 'free' state * to the inode buffer. * * See xlog_recover_reorder_trans for more details. 
*/ STATIC enum xlog_recover_reorder xlog_recover_buf_reorder( struct xlog_recover_item *item) { struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; if (buf_f->blf_flags & XFS_BLF_CANCEL) return XLOG_REORDER_CANCEL_LIST; if (buf_f->blf_flags & XFS_BLF_INODE_BUF) return XLOG_REORDER_INODE_BUFFER_LIST; return XLOG_REORDER_BUFFER_LIST; } STATIC void xlog_recover_buf_ra_pass2( struct xlog *log, struct xlog_recover_item *item) { struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL); } /* * Build up the table of buf cancel records so that we don't replay cancelled * data in the second pass. */ static int xlog_recover_buf_commit_pass1( struct xlog *log, struct xlog_recover_item *item) { struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr; if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { xfs_err(log->l_mp, "bad buffer log item size (%d)", item->ri_buf[0].i_len); return -EFSCORRUPTED; } if (!(bf->blf_flags & XFS_BLF_CANCEL)) trace_xfs_log_recover_buf_not_cancel(log, bf); else if (xlog_add_buffer_cancelled(log, bf->blf_blkno, bf->blf_len)) trace_xfs_log_recover_buf_cancel_add(log, bf); else trace_xfs_log_recover_buf_cancel_ref_inc(log, bf); return 0; } /* * Validate the recovered buffer is of the correct type and attach the * appropriate buffer operations to them for writeback. Magic numbers are in a * few places: * the first 16 bits of the buffer (inode buffer, dquot buffer), * the first 32 bits of the buffer (most blocks), * inside a struct xfs_da_blkinfo at the start of the buffer. 
*/ static void xlog_recover_validate_buf_type( struct xfs_mount *mp, struct xfs_buf *bp, struct xfs_buf_log_format *buf_f, xfs_lsn_t current_lsn) { struct xfs_da_blkinfo *info = bp->b_addr; uint32_t magic32; uint16_t magic16; uint16_t magicda; char *warnmsg = NULL; /* * We can only do post recovery validation on items on CRC enabled * fielsystems as we need to know when the buffer was written to be able * to determine if we should have replayed the item. If we replay old * metadata over a newer buffer, then it will enter a temporarily * inconsistent state resulting in verification failures. Hence for now * just avoid the verification stage for non-crc filesystems */ if (!xfs_has_crc(mp)) return; magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); magic16 = be16_to_cpu(*(__be16*)bp->b_addr); magicda = be16_to_cpu(info->magic); switch (xfs_blft_from_flags(buf_f)) { case XFS_BLFT_BTREE_BUF: switch (magic32) { case XFS_ABTB_CRC_MAGIC: case XFS_ABTB_MAGIC: bp->b_ops = &xfs_bnobt_buf_ops; break; case XFS_ABTC_CRC_MAGIC: case XFS_ABTC_MAGIC: bp->b_ops = &xfs_cntbt_buf_ops; break; case XFS_IBT_CRC_MAGIC: case XFS_IBT_MAGIC: bp->b_ops = &xfs_inobt_buf_ops; break; case XFS_FIBT_CRC_MAGIC: case XFS_FIBT_MAGIC: bp->b_ops = &xfs_finobt_buf_ops; break; case XFS_BMAP_CRC_MAGIC: case XFS_BMAP_MAGIC: bp->b_ops = &xfs_bmbt_buf_ops; break; case XFS_RMAP_CRC_MAGIC: bp->b_ops = &xfs_rmapbt_buf_ops; break; case XFS_REFC_CRC_MAGIC: bp->b_ops = &xfs_refcountbt_buf_ops; break; default: warnmsg = "Bad btree block magic!"; break; } break; case XFS_BLFT_AGF_BUF: if (magic32 != XFS_AGF_MAGIC) { warnmsg = "Bad AGF block magic!"; break; } bp->b_ops = &xfs_agf_buf_ops; break; case XFS_BLFT_AGFL_BUF: if (magic32 != XFS_AGFL_MAGIC) { warnmsg = "Bad AGFL block magic!"; break; } bp->b_ops = &xfs_agfl_buf_ops; break; case XFS_BLFT_AGI_BUF: if (magic32 != XFS_AGI_MAGIC) { warnmsg = "Bad AGI block magic!"; break; } bp->b_ops = &xfs_agi_buf_ops; break; case XFS_BLFT_UDQUOT_BUF: case XFS_BLFT_PDQUOT_BUF: case 
XFS_BLFT_GDQUOT_BUF: #ifdef CONFIG_XFS_QUOTA if (magic16 != XFS_DQUOT_MAGIC) { warnmsg = "Bad DQUOT block magic!"; break; } bp->b_ops = &xfs_dquot_buf_ops; #else xfs_alert(mp, "Trying to recover dquots without QUOTA support built in!"); ASSERT(0); #endif break; case XFS_BLFT_DINO_BUF: if (magic16 != XFS_DINODE_MAGIC) { warnmsg = "Bad INODE block magic!"; break; } bp->b_ops = &xfs_inode_buf_ops; break; case XFS_BLFT_SYMLINK_BUF: if (magic32 != XFS_SYMLINK_MAGIC) { warnmsg = "Bad symlink block magic!"; break; } bp->b_ops = &xfs_symlink_buf_ops; break; case XFS_BLFT_DIR_BLOCK_BUF: if (magic32 != XFS_DIR2_BLOCK_MAGIC && magic32 != XFS_DIR3_BLOCK_MAGIC) { warnmsg = "Bad dir block magic!"; break; } bp->b_ops = &xfs_dir3_block_buf_ops; break; case XFS_BLFT_DIR_DATA_BUF: if (magic32 != XFS_DIR2_DATA_MAGIC && magic32 != XFS_DIR3_DATA_MAGIC) { warnmsg = "Bad dir data magic!"; break; } bp->b_ops = &xfs_dir3_data_buf_ops; break; case XFS_BLFT_DIR_FREE_BUF: if (magic32 != XFS_DIR2_FREE_MAGIC && magic32 != XFS_DIR3_FREE_MAGIC) { warnmsg = "Bad dir3 free magic!"; break; } bp->b_ops = &xfs_dir3_free_buf_ops; break; case XFS_BLFT_DIR_LEAF1_BUF: if (magicda != XFS_DIR2_LEAF1_MAGIC && magicda != XFS_DIR3_LEAF1_MAGIC) { warnmsg = "Bad dir leaf1 magic!"; break; } bp->b_ops = &xfs_dir3_leaf1_buf_ops; break; case XFS_BLFT_DIR_LEAFN_BUF: if (magicda != XFS_DIR2_LEAFN_MAGIC && magicda != XFS_DIR3_LEAFN_MAGIC) { warnmsg = "Bad dir leafn magic!"; break; } bp->b_ops = &xfs_dir3_leafn_buf_ops; break; case XFS_BLFT_DA_NODE_BUF: if (magicda != XFS_DA_NODE_MAGIC && magicda != XFS_DA3_NODE_MAGIC) { warnmsg = "Bad da node magic!"; break; } bp->b_ops = &xfs_da3_node_buf_ops; break; case XFS_BLFT_ATTR_LEAF_BUF: if (magicda != XFS_ATTR_LEAF_MAGIC && magicda != XFS_ATTR3_LEAF_MAGIC) { warnmsg = "Bad attr leaf magic!"; break; } bp->b_ops = &xfs_attr3_leaf_buf_ops; break; case XFS_BLFT_ATTR_RMT_BUF: if (magic32 != XFS_ATTR3_RMT_MAGIC) { warnmsg = "Bad attr remote magic!"; break; } bp->b_ops = 
&xfs_attr3_rmt_buf_ops; break; case XFS_BLFT_SB_BUF: if (magic32 != XFS_SB_MAGIC) { warnmsg = "Bad SB block magic!"; break; } bp->b_ops = &xfs_sb_buf_ops; break; #ifdef CONFIG_XFS_RT case XFS_BLFT_RTBITMAP_BUF: case XFS_BLFT_RTSUMMARY_BUF: /* no magic numbers for verification of RT buffers */ bp->b_ops = &xfs_rtbuf_ops; break; #endif /* CONFIG_XFS_RT */ default: xfs_warn(mp, "Unknown buffer type %d!", xfs_blft_from_flags(buf_f)); break; } /* * Nothing else to do in the case of a NULL current LSN as this means * the buffer is more recent than the change in the log and will be * skipped. */ if (current_lsn == NULLCOMMITLSN) return; if (warnmsg) { xfs_warn(mp, warnmsg); ASSERT(0); } /* * We must update the metadata LSN of the buffer as it is written out to * ensure that older transactions never replay over this one and corrupt * the buffer. This can occur if log recovery is interrupted at some * point after the current transaction completes, at which point a * subsequent mount starts recovery from the beginning. * * Write verifiers update the metadata LSN from log items attached to * the buffer. Therefore, initialize a bli purely to carry the LSN to * the verifier. */ if (bp->b_ops) { struct xfs_buf_log_item *bip; bp->b_flags |= _XBF_LOGRECOVERY; xfs_buf_item_init(bp, mp); bip = bp->b_log_item; bip->bli_item.li_lsn = current_lsn; } } /* * Perform a 'normal' buffer recovery. Each logged region of the * buffer should be copied over the corresponding region in the * given buffer. The bitmap in the buf log format structure indicates * where to place the logged data. 
*/
STATIC void
xlog_recover_do_reg_buffer(
	struct xfs_mount		*mp,
	struct xlog_recover_item	*item,
	struct xfs_buf			*bp,
	struct xfs_buf_log_format	*buf_f,
	xfs_lsn_t			current_lsn)
{
	int			i;
	int			bit;
	int			nbits;
	xfs_failaddr_t		fa;
	const size_t		size_disk_dquot = sizeof(struct xfs_disk_dquot);

	trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);

	bit = 0;
	i = 1;  /* 0 is the buf format structure */
	while (1) {
		/* Find the next run of set bits in the dirty map. */
		bit = xfs_next_bit(buf_f->blf_data_map,
				   buf_f->blf_map_size, bit);
		if (bit == -1)
			break;
		nbits = xfs_contig_bits(buf_f->blf_data_map,
					buf_f->blf_map_size, bit);
		ASSERT(nbits > 0);
		ASSERT(item->ri_buf[i].i_addr != NULL);
		ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
		ASSERT(BBTOB(bp->b_length) >=
		       ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));

		/*
		 * The dirty regions logged in the buffer, even though
		 * contiguous, may span multiple chunks. This is because the
		 * dirty region may span a physical page boundary in a buffer
		 * and hence be split into two separate vectors for writing into
		 * the log. Hence we need to trim nbits back to the length of
		 * the current region being copied out of the log.
		 */
		if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
			nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;

		/*
		 * Do a sanity check if this is a dquot buffer. Just checking
		 * the first dquot in the buffer should do. XXXThis is
		 * probably a good thing to do for other buf types also.
		 */
		fa = NULL;
		if (buf_f->blf_flags &
		   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
			if (item->ri_buf[i].i_addr == NULL) {
				xfs_alert(mp,
					"XFS: NULL dquot in %s.", __func__);
				goto next;
			}
			if (item->ri_buf[i].i_len < size_disk_dquot) {
				xfs_alert(mp,
					"XFS: dquot too small (%d) in %s.",
					item->ri_buf[i].i_len, __func__);
				goto next;
			}
			fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
			if (fa) {
				xfs_alert(mp,
	"dquot corrupt at %pS trying to replay into block 0x%llx",
					fa, xfs_buf_daddr(bp));
				goto next;
			}
		}

		memcpy(xfs_buf_offset(bp,
			(uint)bit << XFS_BLF_SHIFT),	/* dest */
			item->ri_buf[i].i_addr,		/* source */
			nbits<<XFS_BLF_SHIFT);		/* length */
		/*
		 * Reached both after a copy and when a dquot region was
		 * rejected above; in the latter case the region is skipped
		 * without being copied into the buffer.
		 */
 next:
		i++;
		bit += nbits;
	}

	/* Shouldn't be any more regions */
	ASSERT(i == item->ri_total);

	xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
}

/*
 * Perform a dquot buffer recovery.
 * Simple algorithm: if we have found a QUOTAOFF log item of the same type
 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
 * Else, treat it as a regular buffer and do recovery.
 *
 * Return false if the buffer was tossed and true if we recovered the buffer to
 * indicate to the caller if the buffer needs writing.
 */
STATIC bool
xlog_recover_do_dquot_buffer(
	struct xfs_mount		*mp,
	struct xlog			*log,
	struct xlog_recover_item	*item,
	struct xfs_buf			*bp,
	struct xfs_buf_log_format	*buf_f)
{
	uint			type;

	trace_xfs_log_recover_buf_dquot_buf(log, buf_f);

	/*
	 * Filesystems are required to send in quota flags at mount time.
	 */
	if (!mp->m_qflags)
		return false;

	/* Translate the logged buffer flags into quota type bits. */
	type = 0;
	if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
		type |= XFS_DQTYPE_USER;
	if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
		type |= XFS_DQTYPE_PROJ;
	if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
		type |= XFS_DQTYPE_GROUP;
	/*
	 * This type of quotas was turned off, so ignore this buffer
	 */
	if (log->l_quotaoffs_flag & type)
		return false;

	/*
	 * NULLCOMMITLSN is passed through to xlog_recover_validate_buf_type(),
	 * which returns early for that value after choosing bp->b_ops.
	 */
	xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
	return true;
}

/*
 * Perform recovery for a buffer full of inodes.
In these buffers, the only
 * data which should be recovered is that which corresponds to the
 * di_next_unlinked pointers in the on disk inode structures.  The rest of the
 * data for the inodes is always logged through the inodes themselves rather
 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
 *
 * The only time when buffers full of inodes are fully recovered is when the
 * buffer is full of newly allocated inodes.  In this case the buffer will
 * not be marked as an inode buffer and so will be sent to
 * xlog_recover_do_reg_buffer() below during recovery.
 *
 * Returns 0 on success, or -EFSCORRUPTED if a logged di_next_unlinked value
 * is zero.
 */
STATIC int
xlog_recover_do_inode_buffer(
	struct xfs_mount		*mp,
	struct xlog_recover_item	*item,
	struct xfs_buf			*bp,
	struct xfs_buf_log_format	*buf_f)
{
	int				i;
	int				item_index = 0;	/* index into item->ri_buf */
	int				bit = 0;
	int				nbits = 0;
	int				reg_buf_offset = 0;	/* byte offset of current logged region */
	int				reg_buf_bytes = 0;	/* byte length of current logged region */
	int				next_unlinked_offset;
	int				inodes_per_buf;
	xfs_agino_t			*logged_nextp;
	xfs_agino_t			*buffer_nextp;

	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);

	/*
	 * Post recovery validation only works properly on CRC enabled
	 * filesystems.
	 */
	if (xfs_has_crc(mp))
		bp->b_ops = &xfs_inode_buf_ops;

	inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
	for (i = 0; i < inodes_per_buf; i++) {
		/* Byte offset of inode i's di_next_unlinked within the buffer. */
		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
			offsetof(struct xfs_dinode, di_next_unlinked);

		while (next_unlinked_offset >=
		       (reg_buf_offset + reg_buf_bytes)) {
			/*
			 * The next di_next_unlinked field is beyond
			 * the current logged region.  Find the next
			 * logged region that contains or is beyond
			 * the current di_next_unlinked field.
			 */
			bit += nbits;
			bit = xfs_next_bit(buf_f->blf_data_map,
					   buf_f->blf_map_size, bit);

			/*
			 * If there are no more logged regions in the
			 * buffer, then we're done.
			 */
			if (bit == -1)
				return 0;

			nbits = xfs_contig_bits(buf_f->blf_data_map,
						buf_f->blf_map_size, bit);
			ASSERT(nbits > 0);
			reg_buf_offset = bit << XFS_BLF_SHIFT;
			reg_buf_bytes = nbits << XFS_BLF_SHIFT;
			item_index++;
		}

		/*
		 * If the current logged region starts after the current
		 * di_next_unlinked field, then move on to the next
		 * di_next_unlinked field.
		 */
		if (next_unlinked_offset < reg_buf_offset)
			continue;

		ASSERT(item->ri_buf[item_index].i_addr != NULL);
		ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
		ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));

		/*
		 * The current logged region contains a copy of the
		 * current di_next_unlinked field.  Extract its value
		 * and copy it to the buffer copy.
		 */
		logged_nextp = item->ri_buf[item_index].i_addr +
				next_unlinked_offset - reg_buf_offset;
		if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
			xfs_alert(mp,
		"Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
		"Trying to replay bad (0) inode di_next_unlinked field.",
				item, bp);
			return -EFSCORRUPTED;
		}

		buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
		*buffer_nextp = *logged_nextp;

		/*
		 * If necessary, recalculate the CRC in the on-disk inode. We
		 * have to leave the inode in a consistent state for whoever
		 * reads it next....
		 */
		xfs_dinode_calc_crc(mp,
				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));

	}

	return 0;
}

/*
 * V5 filesystems know the age of the buffer on disk being recovered. We can
 * have newer objects on disk than we are replaying, and so for these cases we
 * don't want to replay the current change as that will make the buffer contents
 * temporarily invalid on disk.
 *
 * The magic number might not match the buffer type we are going to recover
 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.  Hence
 * extract the LSN of the existing object in the buffer based on its current
 * magic number.
If we don't recognise the magic number in the buffer, then
 * return a LSN of -1 so that the caller knows it was an unrecognised block and
 * so can recover the buffer.
 *
 * Note: we cannot rely solely on magic number matches to determine that the
 * buffer has a valid LSN - we also need to verify that it belongs to this
 * filesystem, so we need to extract the object's LSN and compare it to that
 * which we read from the superblock. If the UUIDs don't match, then we've got a
 * stale metadata block from an old filesystem instance that we need to recover
 * over the top of.
 */
static xfs_lsn_t
xlog_recover_get_buf_lsn(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp,
	struct xfs_buf_log_format *buf_f)
{
	uint32_t		magic32;
	uint16_t		magic16;
	uint16_t		magicda;
	void			*blk = bp->b_addr;
	uuid_t			*uuid;	/* only read after a case below has set it together with lsn */
	xfs_lsn_t		lsn = -1;
	uint16_t		blft;

	/* v4 filesystems always recover immediately */
	if (!xfs_has_crc(mp))
		goto recover_immediately;

	/*
	 * realtime bitmap and summary file blocks do not have magic numbers or
	 * UUIDs, so we must recover them immediately.
	 */
	blft = xfs_blft_from_flags(buf_f);
	if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF)
		goto recover_immediately;

	/* First try block types identified by a 32 bit magic number. */
	magic32 = be32_to_cpu(*(__be32 *)blk);
	switch (magic32) {
	case XFS_ABTB_CRC_MAGIC:
	case XFS_ABTC_CRC_MAGIC:
	case XFS_ABTB_MAGIC:
	case XFS_ABTC_MAGIC:
	case XFS_RMAP_CRC_MAGIC:
	case XFS_REFC_CRC_MAGIC:
	case XFS_FIBT_CRC_MAGIC:
	case XFS_FIBT_MAGIC:
	case XFS_IBT_CRC_MAGIC:
	case XFS_IBT_MAGIC: {
		struct xfs_btree_block *btb = blk;

		lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
		uuid = &btb->bb_u.s.bb_uuid;
		break;
	}
	case XFS_BMAP_CRC_MAGIC:
	case XFS_BMAP_MAGIC: {
		struct xfs_btree_block *btb = blk;

		lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
		uuid = &btb->bb_u.l.bb_uuid;
		break;
	}
	case XFS_AGF_MAGIC:
		lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
		uuid = &((struct xfs_agf *)blk)->agf_uuid;
		break;
	case XFS_AGFL_MAGIC:
		lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
		uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
		break;
	case XFS_AGI_MAGIC:
		lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
		uuid = &((struct xfs_agi *)blk)->agi_uuid;
		break;
	case XFS_SYMLINK_MAGIC:
		lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
		uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
		break;
	case XFS_DIR3_BLOCK_MAGIC:
	case XFS_DIR3_DATA_MAGIC:
	case XFS_DIR3_FREE_MAGIC:
		lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
		uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
		break;
	case XFS_ATTR3_RMT_MAGIC:
		/*
		 * Remote attr blocks are written synchronously, rather than
		 * being logged. That means they do not contain a valid LSN
		 * (i.e. transactionally ordered) in them, and hence any time we
		 * see a buffer to replay over the top of a remote attribute
		 * block we should simply do so.
		 */
		goto recover_immediately;
	case XFS_SB_MAGIC:
		/*
		 * superblock uuids are magic. We may or may not have a
		 * sb_meta_uuid on disk, but it will be set in the in-core
		 * superblock. We set the uuid pointer for verification
		 * according to the superblock feature mask to ensure we check
		 * the relevant UUID in the superblock.
		 */
		lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
		if (xfs_has_metauuid(mp))
			uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
		else
			uuid = &((struct xfs_dsb *)blk)->sb_uuid;
		break;
	default:
		break;
	}

	/* A 32 bit magic matched: trust the LSN only if the UUID is ours. */
	if (lsn != (xfs_lsn_t)-1) {
		if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
			goto recover_immediately;
		return lsn;
	}

	/* Next try the 16 bit magic held in a struct xfs_da_blkinfo. */
	magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
	switch (magicda) {
	case XFS_DIR3_LEAF1_MAGIC:
	case XFS_DIR3_LEAFN_MAGIC:
	case XFS_ATTR3_LEAF_MAGIC:
	case XFS_DA3_NODE_MAGIC:
		lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
		uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
		break;
	default:
		break;
	}

	if (lsn != (xfs_lsn_t)-1) {
		if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
			goto recover_immediately;
		return lsn;
	}

	/*
	 * We do individual object checks on dquot and inode buffers as they
	 * have their own individual LSN records. Also, we could have a stale
	 * buffer here, so we have to at least recognise these buffer types.
	 *
	 * A noted complexity here is inode unlinked list processing - it logs
	 * the inode directly in the buffer, but we don't know which inodes have
	 * been modified, and there is no global buffer LSN. Hence we need to
	 * recover all inode buffer types immediately. This problem will be
	 * fixed by logical logging of the unlinked list modifications.
	 */
	magic16 = be16_to_cpu(*(__be16 *)blk);
	switch (magic16) {
	case XFS_DQUOT_MAGIC:
	case XFS_DINODE_MAGIC:
		goto recover_immediately;
	default:
		break;
	}

	/* unknown buffer contents, recover immediately */

recover_immediately:
	return (xfs_lsn_t)-1;

}

/*
 * This routine replays a modification made to a buffer at runtime.
 * There are actually two types of buffer, regular and inode, which
 * are handled differently.  Inode buffers are handled differently
 * in that we only recover a specific set of data from them, namely
 * the inode di_next_unlinked fields.
This is because all other inode
 * data is actually logged via inode records and any data we replay
 * here which overlaps that may be stale.
 *
 * When meta-data buffers are freed at run time we log a buffer item
 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
 * of the buffer in the log should not be replayed at recovery time.
 * This is so that if the blocks covered by the buffer are reused for
 * file data before we crash we don't end up replaying old, freed
 * meta-data into a user's file.
 *
 * To handle the cancellation of buffer log items, we make two passes
 * over the log during recovery.  During the first we build a table of
 * those buffers which have been cancelled, and during the second we
 * only replay those buffers which do not have corresponding cancel
 * records in the table.  See xlog_recover_buf_pass[1,2] above
 * for more details on the implementation of the table of cancel records.
 *
 * Returns 0 when the item is cancelled or handled successfully, otherwise the
 * error from reading, verifying, recovering or writing the buffer.
 */
STATIC int
xlog_recover_buf_commit_pass2(
	struct xlog			*log,
	struct list_head		*buffer_list,
	struct xlog_recover_item	*item,
	xfs_lsn_t			current_lsn)
{
	struct xfs_buf_log_format	*buf_f = item->ri_buf[0].i_addr;
	struct xfs_mount		*mp = log->l_mp;
	struct xfs_buf			*bp;
	int				error;
	uint				buf_flags;
	xfs_lsn_t			lsn;

	/*
	 * In this pass we only want to recover all the buffers which have
	 * not been cancelled and are not cancellation buffers themselves.
	 */
	if (buf_f->blf_flags & XFS_BLF_CANCEL) {
		/* Drops one reference on the matching cancel record. */
		if (xlog_put_buffer_cancelled(log, buf_f->blf_blkno,
				buf_f->blf_len))
			goto cancelled;
	} else {

		if (xlog_is_buffer_cancelled(log, buf_f->blf_blkno,
				buf_f->blf_len))
			goto cancelled;
	}

	trace_xfs_log_recover_buf_recover(log, buf_f);

	buf_flags = 0;
	if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
		buf_flags |= XBF_UNMAPPED;

	error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
			  buf_flags, &bp, NULL);
	if (error)
		return error;

	/*
	 * Recover the buffer only if we get an LSN from it and it's less than
	 * the lsn of the transaction we are replaying.
	 *
	 * Note that we have to be extremely careful of readahead here.
	 * Readahead does not attach verifiers to the buffers so if we don't
	 * actually do any replay after readahead because of the LSN we found
	 * in the buffer is more recent than that current transaction then we
	 * need to attach the verifier directly. Failure to do so can lead to
	 * future recovery actions (e.g. EFI and unlinked list recovery) can
	 * operate on the buffers and they won't get the verifier attached. This
	 * can lead to blocks on disk having the correct content but a stale
	 * CRC.
	 *
	 * It is safe to assume these clean buffers are currently up to date.
	 * If the buffer is dirtied by a later transaction being replayed, then
	 * the verifier will be reset to match whatever recover turns that
	 * buffer into.
	 */
	lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f);
	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
		trace_xfs_log_recover_buf_skip(log, buf_f);
		xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);

		/*
		 * We're skipping replay of this buffer log item due to the log
		 * item LSN being behind the ondisk buffer.  Verify the buffer
		 * contents since we aren't going to run the write verifier.
		 */
		if (bp->b_ops) {
			bp->b_ops->verify_read(bp);
			error = bp->b_error;
		}
		goto out_release;
	}

	/* Dispatch on the logged buffer type. */
	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
		if (error)
			goto out_release;
	} else if (buf_f->blf_flags &
		  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
		bool	dirty;

		dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
		if (!dirty)
			goto out_release;
	} else {
		xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
	}

	/*
	 * Perform delayed write on the buffer.  Asynchronous writes will be
	 * slower when taking into account all the buffers to be flushed.
	 *
	 * Also make sure that only inode buffers with good sizes stay in
	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
	 * or inode_cluster_size bytes, whichever is bigger.  The inode
	 * buffers in the log can be a different size if the log was generated
	 * by an older kernel using unclustered inode buffers or a newer kernel
	 * running with a different inode cluster size.  Regardless, if
	 * the inode buffer size isn't max(blocksize, inode_cluster_size)
	 * for *our* value of inode_cluster_size, then we need to keep
	 * the buffer out of the buffer cache so that the buffer won't
	 * overlap with future reads of those inodes.
	 */
	if (XFS_DINODE_MAGIC ==
	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
	    (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
		xfs_buf_stale(bp);
		error = xfs_bwrite(bp);
	} else {
		ASSERT(bp->b_mount == mp);
		bp->b_flags |= _XBF_LOGRECOVERY;
		xfs_buf_delwri_queue(bp, buffer_list);
	}

out_release:
	xfs_buf_relse(bp);
	return error;
cancelled:
	/* No buffer was read on this path, so there is nothing to release. */
	trace_xfs_log_recover_buf_cancel(log, buf_f);
	return 0;
}

/* Recovery callbacks for XFS_LI_BUF log items. */
const struct xlog_recover_item_ops xlog_buf_item_ops = {
	.item_type		= XFS_LI_BUF,
	.reorder		= xlog_recover_buf_reorder,
	.ra_pass2		= xlog_recover_buf_ra_pass2,
	.commit_pass1		= xlog_recover_buf_commit_pass1,
	.commit_pass2		= xlog_recover_buf_commit_pass2,
};

#ifdef DEBUG
/* Assert that every bucket of the buffer cancel table is empty. */
void
xlog_check_buf_cancel_table(
	struct xlog	*log)
{
	int		i;

	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
		ASSERT(list_empty(&log->l_buf_cancel_table[i]));
}
#endif

/*
 * Allocate the buffer cancel table and initialise each bucket as an empty
 * list.  Returns 0 on success or -ENOMEM if the allocation fails.
 */
int
xlog_alloc_buf_cancel_table(
	struct xlog	*log)
{
	void		*p;
	int		i;

	ASSERT(log->l_buf_cancel_table == NULL);

	p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head),
			  GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	log->l_buf_cancel_table = p;
	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);

	return 0;
}

/*
 * Free any cancel records still on the buckets, then the table itself, and
 * clear the table pointer.  Does nothing if no table is allocated.
 */
void
xlog_free_buf_cancel_table(
	struct xlog	*log)
{
	int		i;

	if (!log->l_buf_cancel_table)
		return;

	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) {
		struct xfs_buf_cancel	*bc;

		/* Pop entries one at a time until the bucket is empty. */
		while ((bc = list_first_entry_or_null(
				&log->l_buf_cancel_table[i],
				struct xfs_buf_cancel, bc_list))) {
			list_del(&bc->bc_list);
			kfree(bc);
		}
	}

	kfree(log->l_buf_cancel_table);
	log->l_buf_cancel_table = NULL;
} |
| 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 | // SPDX-License-Identifier: GPL-2.0 /* * (C) Copyright 2002-2004, 2007 Greg Kroah-Hartman <greg@kroah.com> * (C) Copyright 2007 Novell Inc. 
 */

#include <linux/pci.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/device.h>
#include <linux/mempolicy.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/isolation.h>
#include <linux/cpu.h>
#include <linux/pm_runtime.h>
#include <linux/suspend.h>
#include <linux/kexec.h>
#include <linux/of_device.h>
#include <linux/acpi.h>
#include <linux/dma-map-ops.h>
#include <linux/iommu.h>
#include "pci.h"
#include "pcie/portdrv.h"

/*
 * One dynamically added device ID.  Entries are allocated by
 * pci_add_dynid() and chained on the owning driver's dynids.list.
 */
struct pci_dynid {
	struct list_head node;		/* linkage in drv->dynids.list */
	struct pci_device_id id;	/* the ID devices are matched against */
};

/**
 * pci_add_dynid - add a new PCI device ID to this driver and re-probe devices
 * @drv: target pci driver
 * @vendor: PCI vendor ID
 * @device: PCI device ID
 * @subvendor: PCI subvendor ID
 * @subdevice: PCI subdevice ID
 * @class: PCI class
 * @class_mask: PCI class mask
 * @driver_data: private driver data
 *
 * Adds a new dynamic pci device ID to this driver and causes the
 * driver to probe for all devices again.  @drv must have been
 * registered prior to calling this function.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
*/ int pci_add_dynid(struct pci_driver *drv, unsigned int vendor, unsigned int device, unsigned int subvendor, unsigned int subdevice, unsigned int class, unsigned int class_mask, unsigned long driver_data) { struct pci_dynid *dynid; dynid = kzalloc(sizeof(*dynid), GFP_KERNEL); if (!dynid) return -ENOMEM; dynid->id.vendor = vendor; dynid->id.device = device; dynid->id.subvendor = subvendor; dynid->id.subdevice = subdevice; dynid->id.class = class; dynid->id.class_mask = class_mask; dynid->id.driver_data = driver_data; spin_lock(&drv->dynids.lock); list_add_tail(&dynid->node, &drv->dynids.list); spin_unlock(&drv->dynids.lock); return driver_attach(&drv->driver); } EXPORT_SYMBOL_GPL(pci_add_dynid); static void pci_free_dynids(struct pci_driver *drv) { struct pci_dynid *dynid, *n; spin_lock(&drv->dynids.lock); list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) { list_del(&dynid->node); kfree(dynid); } spin_unlock(&drv->dynids.lock); } /** * pci_match_id - See if a PCI device matches a given pci_id table * @ids: array of PCI device ID structures to search in * @dev: the PCI device structure to match against. * * Used by a driver to check whether a PCI device is in its list of * supported devices. Returns the matching pci_device_id structure or * %NULL if there is no match. * * Deprecated; don't use this as it will not catch any dynamic IDs * that a driver might want to check for. 
*/ const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, struct pci_dev *dev) { if (ids) { while (ids->vendor || ids->subvendor || ids->class_mask) { if (pci_match_one_device(ids, dev)) return ids; ids++; } } return NULL; } EXPORT_SYMBOL(pci_match_id); static const struct pci_device_id pci_device_id_any = { .vendor = PCI_ANY_ID, .device = PCI_ANY_ID, .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, }; /** * pci_match_device - See if a device matches a driver's list of IDs * @drv: the PCI driver to match against * @dev: the PCI device structure to match against * * Used by a driver to check whether a PCI device is in its list of * supported devices or in the dynids list, which may have been augmented * via the sysfs "new_id" file. Returns the matching pci_device_id * structure or %NULL if there is no match. */ static const struct pci_device_id *pci_match_device(struct pci_driver *drv, struct pci_dev *dev) { struct pci_dynid *dynid; const struct pci_device_id *found_id = NULL, *ids; /* When driver_override is set, only bind to the matching driver */ if (dev->driver_override && strcmp(dev->driver_override, drv->name)) return NULL; /* Look at the dynamic ids first, before the static ones */ spin_lock(&drv->dynids.lock); list_for_each_entry(dynid, &drv->dynids.list, node) { if (pci_match_one_device(&dynid->id, dev)) { found_id = &dynid->id; break; } } spin_unlock(&drv->dynids.lock); if (found_id) return found_id; for (ids = drv->id_table; (found_id = pci_match_id(ids, dev)); ids = found_id + 1) { /* * The match table is split based on driver_override. * In case override_only was set, enforce driver_override * matching. 
*/ if (found_id->override_only) { if (dev->driver_override) return found_id; } else { return found_id; } } /* driver_override will always match, send a dummy id */ if (dev->driver_override) return &pci_device_id_any; return NULL; } /** * new_id_store - sysfs frontend to pci_add_dynid() * @driver: target device driver * @buf: buffer for scanning device ID data * @count: input size * * Allow PCI IDs to be added to an existing driver via sysfs. */ static ssize_t new_id_store(struct device_driver *driver, const char *buf, size_t count) { struct pci_driver *pdrv = to_pci_driver(driver); const struct pci_device_id *ids = pdrv->id_table; u32 vendor, device, subvendor = PCI_ANY_ID, subdevice = PCI_ANY_ID, class = 0, class_mask = 0; unsigned long driver_data = 0; int fields; int retval = 0; fields = sscanf(buf, "%x %x %x %x %x %x %lx", &vendor, &device, &subvendor, &subdevice, &class, &class_mask, &driver_data); if (fields < 2) return -EINVAL; if (fields != 7) { struct pci_dev *pdev = kzalloc(sizeof(*pdev), GFP_KERNEL); if (!pdev) return -ENOMEM; pdev->vendor = vendor; pdev->device = device; pdev->subsystem_vendor = subvendor; pdev->subsystem_device = subdevice; pdev->class = class; if (pci_match_device(pdrv, pdev)) retval = -EEXIST; kfree(pdev); if (retval) return retval; } /* Only accept driver_data values that match an existing id_table entry */ if (ids) { retval = -EINVAL; while (ids->vendor || ids->subvendor || ids->class_mask) { if (driver_data == ids->driver_data) { retval = 0; break; } ids++; } if (retval) /* No match */ return retval; } retval = pci_add_dynid(pdrv, vendor, device, subvendor, subdevice, class, class_mask, driver_data); if (retval) return retval; return count; } static DRIVER_ATTR_WO(new_id); /** * remove_id_store - remove a PCI device ID from this driver * @driver: target device driver * @buf: buffer for scanning device ID data * @count: input size * * Removes a dynamic pci device ID to this driver. 
*/ static ssize_t remove_id_store(struct device_driver *driver, const char *buf, size_t count) { struct pci_dynid *dynid, *n; struct pci_driver *pdrv = to_pci_driver(driver); u32 vendor, device, subvendor = PCI_ANY_ID, subdevice = PCI_ANY_ID, class = 0, class_mask = 0; int fields; size_t retval = -ENODEV; fields = sscanf(buf, "%x %x %x %x %x %x", &vendor, &device, &subvendor, &subdevice, &class, &class_mask); if (fields < 2) return -EINVAL; spin_lock(&pdrv->dynids.lock); list_for_each_entry_safe(dynid, n, &pdrv->dynids.list, node) { struct pci_device_id *id = &dynid->id; if ((id->vendor == vendor) && (id->device == device) && (subvendor == PCI_ANY_ID || id->subvendor == subvendor) && (subdevice == PCI_ANY_ID || id->subdevice == subdevice) && !((id->class ^ class) & class_mask)) { list_del(&dynid->node); kfree(dynid); retval = count; break; } } spin_unlock(&pdrv->dynids.lock); return retval; } static DRIVER_ATTR_WO(remove_id); static struct attribute *pci_drv_attrs[] = { &driver_attr_new_id.attr, &driver_attr_remove_id.attr, NULL, }; ATTRIBUTE_GROUPS(pci_drv); struct drv_dev_and_id { struct pci_driver *drv; struct pci_dev *dev; const struct pci_device_id *id; }; static long local_pci_probe(void *_ddi) { struct drv_dev_and_id *ddi = _ddi; struct pci_dev *pci_dev = ddi->dev; struct pci_driver *pci_drv = ddi->drv; struct device *dev = &pci_dev->dev; int rc; /* * Unbound PCI devices are always put in D0, regardless of * runtime PM status. During probe, the device is set to * active and the usage count is incremented. If the driver * supports runtime PM, it should call pm_runtime_put_noidle(), * or any other runtime PM helper function decrementing the usage * count, in its probe routine and pm_runtime_get_noresume() in * its remove routine. 
*/ pm_runtime_get_sync(dev); pci_dev->driver = pci_drv; rc = pci_drv->probe(pci_dev, ddi->id); if (!rc) return rc; if (rc < 0) { pci_dev->driver = NULL; pm_runtime_put_sync(dev); return rc; } /* * Probe function should return < 0 for failure, 0 for success * Treat values > 0 as success, but warn. */ pci_warn(pci_dev, "Driver probe function unexpectedly returned %d\n", rc); return 0; } static bool pci_physfn_is_probed(struct pci_dev *dev) { #ifdef CONFIG_PCI_IOV return dev->is_virtfn && dev->physfn->is_probed; #else return false; #endif } static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, const struct pci_device_id *id) { int error, node, cpu; struct drv_dev_and_id ddi = { drv, dev, id }; /* * Execute driver initialization on node where the device is * attached. This way the driver likely allocates its local memory * on the right node. */ node = dev_to_node(&dev->dev); dev->is_probed = 1; cpu_hotplug_disable(); /* * Prevent nesting work_on_cpu() for the case where a Virtual Function * device is probed from work_on_cpu() of the Physical device. */ if (node < 0 || node >= MAX_NUMNODES || !node_online(node) || pci_physfn_is_probed(dev)) { cpu = nr_cpu_ids; } else { cpumask_var_t wq_domain_mask; if (!zalloc_cpumask_var(&wq_domain_mask, GFP_KERNEL)) { error = -ENOMEM; goto out; } cpumask_and(wq_domain_mask, housekeeping_cpumask(HK_TYPE_WQ), housekeeping_cpumask(HK_TYPE_DOMAIN)); cpu = cpumask_any_and(cpumask_of_node(node), wq_domain_mask); free_cpumask_var(wq_domain_mask); } if (cpu < nr_cpu_ids) error = work_on_cpu(cpu, local_pci_probe, &ddi); else error = local_pci_probe(&ddi); out: dev->is_probed = 0; cpu_hotplug_enable(); return error; } /** * __pci_device_probe - check if a driver wants to claim a specific PCI device * @drv: driver to call to check if it wants the PCI device * @pci_dev: PCI device being probed * * returns 0 on success, else error. * side-effect: pci_dev->driver is set to drv when drv claims pci_dev. 
*/ static int __pci_device_probe(struct pci_driver *drv, struct pci_dev *pci_dev) { const struct pci_device_id *id; int error = 0; if (drv->probe) { error = -ENODEV; id = pci_match_device(drv, pci_dev); if (id) error = pci_call_probe(drv, pci_dev, id); } return error; } #ifdef CONFIG_PCI_IOV static inline bool pci_device_can_probe(struct pci_dev *pdev) { return (!pdev->is_virtfn || pdev->physfn->sriov->drivers_autoprobe || pdev->driver_override); } #else static inline bool pci_device_can_probe(struct pci_dev *pdev) { return true; } #endif static int pci_device_probe(struct device *dev) { int error; struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = to_pci_driver(dev->driver); if (!pci_device_can_probe(pci_dev)) return -ENODEV; pci_assign_irq(pci_dev); error = pcibios_alloc_irq(pci_dev); if (error < 0) return error; pci_dev_get(pci_dev); error = __pci_device_probe(drv, pci_dev); if (error) { pcibios_free_irq(pci_dev); pci_dev_put(pci_dev); } return error; } static void pci_device_remove(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; if (drv->remove) { pm_runtime_get_sync(dev); /* * If the driver provides a .runtime_idle() callback and it has * started to run already, it may continue to run in parallel * with the code below, so wait until all of the runtime PM * activity has completed. */ pm_runtime_barrier(dev); drv->remove(pci_dev); pm_runtime_put_noidle(dev); } pcibios_free_irq(pci_dev); pci_dev->driver = NULL; pci_iov_remove(pci_dev); /* Undo the runtime PM settings in local_pci_probe() */ pm_runtime_put_sync(dev); /* * If the device is still on, set the power state as "unknown", * since it might change by the next time we load the driver. 
*/ if (pci_dev->current_state == PCI_D0) pci_dev->current_state = PCI_UNKNOWN; /* * We would love to complain here if pci_dev->is_enabled is set, that * the driver should have called pci_disable_device(), but the * unfortunate fact is there are too many odd BIOS and bridge setups * that don't like drivers doing that all of the time. * Oh well, we can dream of sane hardware when we sleep, no matter how * horrible the crap we have to deal with is when we are awake... */ pci_dev_put(pci_dev); } static void pci_device_shutdown(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; pm_runtime_resume(dev); if (drv && drv->shutdown) drv->shutdown(pci_dev); /* * If this is a kexec reboot, turn off Bus Master bit on the * device to tell it to not continue to do DMA. Don't touch * devices in D3cold or unknown states. * If it is not a kexec reboot, firmware will hit the PCI * devices with big hammer and stop their DMA any way. */ if (kexec_in_progress && (pci_dev->current_state <= PCI_D3hot)) pci_clear_master(pci_dev); } #ifdef CONFIG_PM_SLEEP /* Auxiliary functions used for system resume */ /** * pci_restore_standard_config - restore standard config registers of PCI device * @pci_dev: PCI device to handle */ static int pci_restore_standard_config(struct pci_dev *pci_dev) { pci_update_current_state(pci_dev, PCI_UNKNOWN); if (pci_dev->current_state != PCI_D0) { int error = pci_set_power_state(pci_dev, PCI_D0); if (error) return error; } pci_restore_state(pci_dev); pci_pme_restore(pci_dev); return 0; } #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM /* Auxiliary functions used for system resume and run-time resume */ static void pci_pm_default_resume(struct pci_dev *pci_dev) { pci_fixup_device(pci_fixup_resume, pci_dev); pci_enable_wake(pci_dev, PCI_D0, false); } static void pci_pm_power_up_and_verify_state(struct pci_dev *pci_dev) { pci_power_up(pci_dev); pci_update_current_state(pci_dev, PCI_D0); } static void 
pci_pm_default_resume_early(struct pci_dev *pci_dev) { pci_pm_power_up_and_verify_state(pci_dev); pci_restore_state(pci_dev); pci_pme_restore(pci_dev); } static void pci_pm_bridge_power_up_actions(struct pci_dev *pci_dev) { int ret; ret = pci_bridge_wait_for_secondary_bus(pci_dev, "resume"); if (ret) { /* * The downstream link failed to come up, so mark the * devices below as disconnected to make sure we don't * attempt to resume them. */ pci_walk_bus(pci_dev->subordinate, pci_dev_set_disconnected, NULL); return; } /* * When powering on a bridge from D3cold, the whole hierarchy may be * powered on into D0uninitialized state, resume them to give them a * chance to suspend again */ pci_resume_bus(pci_dev->subordinate); } #endif /* CONFIG_PM */ #ifdef CONFIG_PM_SLEEP /* * Default "suspend" method for devices that have no driver provided suspend, * or not even a driver at all (second part). */ static void pci_pm_set_unknown_state(struct pci_dev *pci_dev) { /* * mark its power state as "unknown", since we don't know if * e.g. the BIOS will change its device state when we suspend. */ if (pci_dev->current_state == PCI_D0) pci_dev->current_state = PCI_UNKNOWN; } /* * Default "resume" method for devices that have no driver provided resume, * or not even a driver at all (second part). 
*/ static int pci_pm_reenable_device(struct pci_dev *pci_dev) { int retval; /* if the device was enabled before suspend, re-enable */ retval = pci_reenable_device(pci_dev); /* * if the device was busmaster before the suspend, make it busmaster * again */ if (pci_dev->is_busmaster) pci_set_master(pci_dev); return retval; } static int pci_legacy_suspend(struct device *dev, pm_message_t state) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; if (drv && drv->suspend) { pci_power_t prev = pci_dev->current_state; int error; error = drv->suspend(pci_dev, state); suspend_report_result(dev, drv->suspend, error); if (error) return error; if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev, "PCI PM: Device state not saved by %pS\n", drv->suspend); } } pci_fixup_device(pci_fixup_suspend, pci_dev); return 0; } static int pci_legacy_suspend_late(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); if (!pci_dev->state_saved) pci_save_state(pci_dev); pci_pm_set_unknown_state(pci_dev); pci_fixup_device(pci_fixup_suspend_late, pci_dev); return 0; } static int pci_legacy_resume(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *drv = pci_dev->driver; pci_fixup_device(pci_fixup_resume, pci_dev); return drv && drv->resume ? drv->resume(pci_dev) : pci_pm_reenable_device(pci_dev); } /* Auxiliary functions used by the new power management framework */ static void pci_pm_default_suspend(struct pci_dev *pci_dev) { /* Disable non-bridge devices without PM support */ if (!pci_has_subordinate(pci_dev)) pci_disable_enabled_device(pci_dev); } static bool pci_has_legacy_pm_support(struct pci_dev *pci_dev) { struct pci_driver *drv = pci_dev->driver; bool ret = drv && (drv->suspend || drv->resume); /* * Legacy PM support is used by default, so warn if the new framework is * supported as well. 
Drivers are supposed to support either the * former, or the latter, but not both at the same time. */ pci_WARN(pci_dev, ret && drv->driver.pm, "device %04x:%04x\n", pci_dev->vendor, pci_dev->device); return ret; } /* New power management framework */ static int pci_pm_prepare(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pm && pm->prepare) { int error = pm->prepare(dev); if (error < 0) return error; if (!error && dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_PREPARE)) return 0; } if (pci_dev_need_resume(pci_dev)) return 0; /* * The PME setting needs to be adjusted here in case the direct-complete * optimization is used with respect to this device. */ pci_dev_adjust_pme(pci_dev); return 1; } static void pci_pm_complete(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); pci_dev_complete_resume(pci_dev); pm_generic_complete(dev); /* Resume device if platform firmware has put it in reset-power-on */ if (pm_runtime_suspended(dev) && pm_resume_via_firmware()) { pci_power_t pre_sleep_state = pci_dev->current_state; pci_refresh_power_state(pci_dev); /* * On platforms with ACPI this check may also trigger for * devices sharing power resources if one of those power * resources has been activated as a result of a change of the * power state of another device sharing it. However, in that * case it is also better to resume the device, in general. */ if (pci_dev->current_state < pre_sleep_state) pm_request_resume(dev); } } #else /* !CONFIG_PM_SLEEP */ #define pci_pm_prepare NULL #define pci_pm_complete NULL #endif /* !CONFIG_PM_SLEEP */ #ifdef CONFIG_SUSPEND static void pcie_pme_root_status_cleanup(struct pci_dev *pci_dev) { /* * Some BIOSes forget to clear Root PME Status bits after system * wakeup, which breaks ACPI-based runtime wakeup on PCI Express. * Clear those bits now just in case (shouldn't hurt). 
*/ if (pci_is_pcie(pci_dev) && (pci_pcie_type(pci_dev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(pci_dev) == PCI_EXP_TYPE_RC_EC)) pcie_clear_root_pme_status(pci_dev); } static int pci_pm_suspend(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; pci_dev->skip_bus_pm = false; /* * Disabling PTM allows some systems, e.g., Intel mobile chips * since Coffee Lake, to enter a lower-power PM state. */ pci_suspend_ptm(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_SUSPEND); if (!pm) { pci_pm_default_suspend(pci_dev); return 0; } /* * PCI devices suspended at run time may need to be resumed at this * point, because in general it may be necessary to reconfigure them for * system suspend. Namely, if the device is expected to wake up the * system from the sleep state, it may have to be reconfigured for this * purpose, or if the device is not expected to wake up the system from * the sleep state, it should be prevented from signaling wakeup events * going forward. * * Also if the driver of the device does not indicate that its system * suspend callbacks can cope with runtime-suspended devices, it is * better to resume the device from runtime suspend here. 
*/ if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) || pci_dev_need_resume(pci_dev)) { pm_runtime_resume(dev); pci_dev->state_saved = false; } else { pci_dev_adjust_pme(pci_dev); } if (pm->suspend) { pci_power_t prev = pci_dev->current_state; int error; error = pm->suspend(dev); suspend_report_result(dev, pm->suspend, error); if (error) return error; if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev, "PCI PM: State of device not saved by %pS\n", pm->suspend); } } return 0; } static int pci_pm_suspend_late(struct device *dev) { if (dev_pm_skip_suspend(dev)) return 0; pci_fixup_device(pci_fixup_suspend, to_pci_dev(dev)); return pm_generic_suspend_late(dev); } static int pci_pm_suspend_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (dev_pm_skip_suspend(dev)) return 0; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend_late(dev); if (!pm) { pci_save_state(pci_dev); goto Fixup; } if (pm->suspend_noirq) { pci_power_t prev = pci_dev->current_state; int error; error = pm->suspend_noirq(dev); suspend_report_result(dev, pm->suspend_noirq, error); if (error) return error; if (!pci_dev->state_saved && pci_dev->current_state != PCI_D0 && pci_dev->current_state != PCI_UNKNOWN) { pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev, "PCI PM: State of device not saved by %pS\n", pm->suspend_noirq); goto Fixup; } } if (!pci_dev->state_saved) { pci_save_state(pci_dev); /* * If the device is a bridge with a child in D0 below it, * it needs to stay in D0, so check skip_bus_pm to avoid * putting it into a low-power state in that case. 
*/ if (!pci_dev->skip_bus_pm && pci_power_manageable(pci_dev)) pci_prepare_to_sleep(pci_dev); } pci_dbg(pci_dev, "PCI PM: Suspend power state: %s\n", pci_power_name(pci_dev->current_state)); if (pci_dev->current_state == PCI_D0) { pci_dev->skip_bus_pm = true; /* * Per PCI PM r1.2, table 6-1, a bridge must be in D0 if any * downstream device is in D0, so avoid changing the power state * of the parent bridge by setting the skip_bus_pm flag for it. */ if (pci_dev->bus->self) pci_dev->bus->self->skip_bus_pm = true; } if (pci_dev->skip_bus_pm && pm_suspend_no_platform()) { pci_dbg(pci_dev, "PCI PM: Skipped\n"); goto Fixup; } pci_pm_set_unknown_state(pci_dev); /* * Some BIOSes from ASUS have a bug: If a USB EHCI host controller's * PCI COMMAND register isn't 0, the BIOS assumes that the controller * hasn't been quiesced and tries to turn it off. If the controller * is already in D3, this can hang or cause memory corruption. * * Since the value of the COMMAND register doesn't matter once the * device has been suspended, we can safely set it to 0 here. */ if (pci_dev->class == PCI_CLASS_SERIAL_USB_EHCI) pci_write_config_word(pci_dev, PCI_COMMAND, 0); Fixup: pci_fixup_device(pci_fixup_suspend_late, pci_dev); /* * If the target system sleep state is suspend-to-idle, it is sufficient * to check whether or not the device's wakeup settings are good for * runtime PM. Otherwise, the pm_resume_via_firmware() check will cause * pci_pm_complete() to take care of fixing up the device's state * anyway, if need be. */ if (device_can_wakeup(dev) && !device_may_wakeup(dev)) dev->power.may_skip_resume = false; return 0; } static int pci_pm_resume_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? 
dev->driver->pm : NULL; pci_power_t prev_state = pci_dev->current_state; bool skip_bus_pm = pci_dev->skip_bus_pm; if (dev_pm_skip_resume(dev)) return 0; /* * In the suspend-to-idle case, devices left in D0 during suspend will * stay in D0, so it is not necessary to restore or update their * configuration here and attempting to put them into D0 again is * pointless, so avoid doing that. */ if (!(skip_bus_pm && pm_suspend_no_platform())) pci_pm_default_resume_early(pci_dev); pci_fixup_device(pci_fixup_resume_early, pci_dev); pcie_pme_root_status_cleanup(pci_dev); if (!skip_bus_pm && prev_state == PCI_D3cold) pci_pm_bridge_power_up_actions(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return 0; if (pm && pm->resume_noirq) return pm->resume_noirq(dev); return 0; } static int pci_pm_resume_early(struct device *dev) { if (dev_pm_skip_resume(dev)) return 0; return pm_generic_resume_early(dev); } static int pci_pm_resume(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; /* * This is necessary for the suspend error path in which resume is * called without restoring the standard config registers of the device. */ if (pci_dev->state_saved) pci_restore_standard_config(pci_dev); pci_resume_ptm(pci_dev); if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_resume(dev); pci_pm_default_resume(pci_dev); if (pm) { if (pm->resume) return pm->resume(dev); } else { pci_pm_reenable_device(pci_dev); } return 0; } #else /* !CONFIG_SUSPEND */ #define pci_pm_suspend NULL #define pci_pm_suspend_late NULL #define pci_pm_suspend_noirq NULL #define pci_pm_resume NULL #define pci_pm_resume_early NULL #define pci_pm_resume_noirq NULL #endif /* !CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS static int pci_pm_freeze(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? 
dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend(dev, PMSG_FREEZE); if (!pm) { pci_pm_default_suspend(pci_dev); return 0; } /* * Resume all runtime-suspended devices before creating a snapshot * image of system memory, because the restore kernel generally cannot * be expected to always handle them consistently and they need to be * put into the runtime-active metastate during system resume anyway, * so it is better to ensure that the state saved in the image will be * always consistent with that. */ pm_runtime_resume(dev); pci_dev->state_saved = false; if (pm->freeze) { int error; error = pm->freeze(dev); suspend_report_result(dev, pm->freeze, error); if (error) return error; } return 0; } static int pci_pm_freeze_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; if (pci_has_legacy_pm_support(pci_dev)) return pci_legacy_suspend_late(dev); if (pm && pm->freeze_noirq) { int error; error = pm->freeze_noirq(dev); suspend_report_result(dev, pm->freeze_noirq, error); if (error) return error; } if (!pci_dev->state_saved) pci_save_state(pci_dev); pci_pm_set_unknown_state(pci_dev); return 0; } static int pci_pm_thaw_noirq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; /* * The pm->thaw_noirq() callback assumes the device has been * returned to D0 and its config state has been restored. * * In addition, pci_restore_state() restores MSI-X state in MMIO * space, which requires the device to be in D0, so return it to D0 * in case the driver's "freeze" callbacks put it into a low-power * state. 
*/
	pci_pm_power_up_and_verify_state(pci_dev);
	pci_restore_state(pci_dev);

	if (pci_has_legacy_pm_support(pci_dev))
		return 0;

	if (pm && pm->thaw_noirq)
		return pm->thaw_noirq(dev);

	return 0;
}

/*
 * Bus-level ->thaw() callback (installed as .thaw in pci_dev_pm_ops).
 *
 * Devices with legacy PM support are handed to pci_legacy_resume().
 * Otherwise the driver's ->thaw() is invoked when present, or the device is
 * re-enabled when no dev_pm_ops are available.  On that non-legacy path
 * state_saved is cleared before the driver's result (0 by default) is
 * returned.
 */
static int pci_pm_thaw(struct device *dev)
{
	struct pci_dev *pci_dev = to_pci_dev(dev);
	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
	int error = 0;

	if (pci_has_legacy_pm_support(pci_dev))
		return pci_legacy_resume(dev);

	if (pm) {
		if (pm->thaw)
			error = pm->thaw(dev);
	} else {
		pci_pm_reenable_device(pci_dev);
	}

	pci_dev->state_saved = false;

	return error;
}

/*
 * Bus-level ->poweroff() callback.
 *
 * Legacy-PM devices go through pci_legacy_suspend() with PMSG_HIBERNATE and
 * devices without dev_pm_ops get pci_pm_default_suspend().  Otherwise the
 * device is runtime-resumed (and state_saved cleared) unless
 * DPM_FLAG_SMART_SUSPEND is set and pci_dev_need_resume() returns false, in
 * which case only pci_dev_adjust_pme() is called.  The driver's ->poweroff(),
 * if any, runs last; a non-zero result is reported and returned.
 */
static int pci_pm_poweroff(struct device *dev)
{
	struct pci_dev *pci_dev = to_pci_dev(dev);
	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;

	if (pci_has_legacy_pm_support(pci_dev))
		return pci_legacy_suspend(dev, PMSG_HIBERNATE);

	if (!pm) {
		pci_pm_default_suspend(pci_dev);
		return 0;
	}

	/* The reason to do that is the same as in pci_pm_suspend(). */
	if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) ||
	    pci_dev_need_resume(pci_dev)) {
		pm_runtime_resume(dev);
		pci_dev->state_saved = false;
	} else {
		pci_dev_adjust_pme(pci_dev);
	}

	if (pm->poweroff) {
		int error;

		error = pm->poweroff(dev);
		suspend_report_result(dev, pm->poweroff, error);
		if (error)
			return error;
	}

	return 0;
}

/*
 * Bus-level ->poweroff_late() callback.
 *
 * Returns 0 immediately when dev_pm_skip_suspend() is true; otherwise applies
 * the pci_fixup_suspend fixups and defers to pm_generic_poweroff_late().
 */
static int pci_pm_poweroff_late(struct device *dev)
{
	if (dev_pm_skip_suspend(dev))
		return 0;

	pci_fixup_device(pci_fixup_suspend, to_pci_dev(dev));

	return pm_generic_poweroff_late(dev);
}

/*
 * Bus-level ->poweroff_noirq() callback.
 *
 * Skipped entirely when dev_pm_skip_suspend() is true.  Legacy-PM devices use
 * pci_legacy_suspend_late(); devices without dev_pm_ops only get the
 * pci_fixup_suspend_late fixups.  Otherwise the driver's ->poweroff_noirq()
 * runs first (a non-zero result is reported and returned), then the device
 * is prepared for sleep if its state was not saved and pci_has_subordinate()
 * is false.
 */
static int pci_pm_poweroff_noirq(struct device *dev)
{
	struct pci_dev *pci_dev = to_pci_dev(dev);
	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;

	if (dev_pm_skip_suspend(dev))
		return 0;

	if (pci_has_legacy_pm_support(pci_dev))
		return pci_legacy_suspend_late(dev);

	if (!pm) {
		pci_fixup_device(pci_fixup_suspend_late, pci_dev);
		return 0;
	}

	if (pm->poweroff_noirq) {
		int error;

		error = pm->poweroff_noirq(dev);
		suspend_report_result(dev, pm->poweroff_noirq, error);
		if (error)
			return error;
	}

	if (!pci_dev->state_saved && !pci_has_subordinate(pci_dev))
		pci_prepare_to_sleep(pci_dev);

	/*
	 * The reason for doing this here is the same as for the analogous code
	 * in pci_pm_suspend_noirq().
	 */
	if (pci_dev->class == PCI_CLASS_SERIAL_USB_EHCI)
		pci_write_config_word(pci_dev, PCI_COMMAND, 0);

	pci_fixup_device(pci_fixup_suspend_late, pci_dev);

	return 0;
}

/*
 * Bus-level ->restore_noirq() callback.
 *
 * Always runs pci_pm_default_resume_early() and the pci_fixup_resume_early
 * fixups, then calls the driver's ->restore_noirq() unless the device uses
 * legacy PM or the driver provides no such callback.
 */
static int pci_pm_restore_noirq(struct device *dev)
{
	struct pci_dev *pci_dev = to_pci_dev(dev);
	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;

	pci_pm_default_resume_early(pci_dev);
	pci_fixup_device(pci_fixup_resume_early, pci_dev);

	if (pci_has_legacy_pm_support(pci_dev))
		return 0;

	if (pm && pm->restore_noirq)
		return pm->restore_noirq(dev);

	return 0;
}

/*
 * Bus-level ->restore() callback.
 *
 * Restores the standard config space if state_saved is still set, then either
 * hands off to pci_legacy_resume() or runs pci_pm_default_resume() followed
 * by the driver's ->restore().  Without dev_pm_ops the device is re-enabled
 * instead.
 */
static int pci_pm_restore(struct device *dev)
{
	struct pci_dev *pci_dev = to_pci_dev(dev);
	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;

	/*
	 * This is necessary for the hibernation error path in which restore is
	 * called without restoring the standard config registers of the device.
	 */
	if (pci_dev->state_saved)
		pci_restore_standard_config(pci_dev);

	if (pci_has_legacy_pm_support(pci_dev))
		return pci_legacy_resume(dev);

	pci_pm_default_resume(pci_dev);

	if (pm) {
		if (pm->restore)
			return pm->restore(dev);
	} else {
		pci_pm_reenable_device(pci_dev);
	}

	return 0;
}

#else /* !CONFIG_HIBERNATE_CALLBACKS */

/* Hibernation callbacks become NULL entries in pci_dev_pm_ops. */
#define pci_pm_freeze		NULL
#define pci_pm_freeze_noirq	NULL
#define pci_pm_thaw		NULL
#define pci_pm_thaw_noirq	NULL
#define pci_pm_poweroff		NULL
#define pci_pm_poweroff_late	NULL
#define pci_pm_poweroff_noirq	NULL
#define pci_pm_restore		NULL
#define pci_pm_restore_noirq	NULL

#endif /* !CONFIG_HIBERNATE_CALLBACKS */

#ifdef CONFIG_PM

/*
 * Bus-level ->runtime_suspend() callback.
 *
 * Returns 0 on success or the error returned by the driver's
 * ->runtime_suspend().  Unbound devices only have their state saved.  For
 * bound devices the driver callback runs first; if the state is still not
 * saved afterwards, it is saved here and pci_finish_runtime_suspend() is
 * called.
 */
static int pci_pm_runtime_suspend(struct device *dev)
{
	struct pci_dev *pci_dev = to_pci_dev(dev);
	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
	pci_power_t prev = pci_dev->current_state;
	int error;

	pci_suspend_ptm(pci_dev);

	/*
	 * If pci_dev->driver is not set (unbound), we leave the device in D0,
	 * but it may go to D3cold when the bridge above it runtime suspends.
	 * Save its config space in case that happens.
	 */
	if (!pci_dev->driver) {
		pci_save_state(pci_dev);
		return 0;
	}

	pci_dev->state_saved = false;
	if (pm && pm->runtime_suspend) {
		error = pm->runtime_suspend(dev);
		/*
		 * -EBUSY and -EAGAIN are used to request the runtime PM core
		 * to schedule a new suspend, so log the event only with debug
		 * log level.
		 */
		if (error == -EBUSY || error == -EAGAIN) {
			pci_dbg(pci_dev, "can't suspend now (%ps returned %d)\n",
				pm->runtime_suspend, error);
			return error;
		} else if (error) {
			pci_err(pci_dev, "can't suspend (%ps returned %d)\n",
				pm->runtime_suspend, error);
			return error;
		}
	}

	pci_fixup_device(pci_fixup_suspend, pci_dev);

	/*
	 * The driver callback left the device in a state other than D0 or
	 * unknown without saving its config space: warn (once) if that state
	 * differs from the one seen on entry and do not touch the device.
	 */
	if (pm && pm->runtime_suspend
	    && !pci_dev->state_saved && pci_dev->current_state != PCI_D0
	    && pci_dev->current_state != PCI_UNKNOWN) {
		pci_WARN_ONCE(pci_dev, pci_dev->current_state != prev,
			      "PCI PM: State of device not saved by %pS\n",
			      pm->runtime_suspend);
		return 0;
	}

	if (!pci_dev->state_saved) {
		pci_save_state(pci_dev);
		pci_finish_runtime_suspend(pci_dev);
	}

	return 0;
}

/*
 * Bus-level ->runtime_resume() callback.
 *
 * Performs the early default resume and pci_resume_ptm() for every device.
 * For bound devices it additionally applies the resume-early fixups and the
 * default resume, runs the bridge power-up actions when the device was in
 * D3cold on entry, and finally calls the driver's ->runtime_resume(), whose
 * result is returned (0 when there is no such callback).
 */
static int pci_pm_runtime_resume(struct device *dev)
{
	struct pci_dev *pci_dev = to_pci_dev(dev);
	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
	pci_power_t prev_state = pci_dev->current_state;
	int error = 0;

	/*
	 * Restoring config space is necessary even if the device is not bound
	 * to a driver because although we left it in D0, it may have gone to
	 * D3cold when the bridge above it runtime suspended.
	 */
	pci_pm_default_resume_early(pci_dev);
	pci_resume_ptm(pci_dev);

	if (!pci_dev->driver)
		return 0;

	pci_fixup_device(pci_fixup_resume_early, pci_dev);
	pci_pm_default_resume(pci_dev);

	if (prev_state == PCI_D3cold)
		pci_pm_bridge_power_up_actions(pci_dev);

	if (pm && pm->runtime_resume)
		error = pm->runtime_resume(dev);

	return error;
}

/*
 * Bus-level ->runtime_idle() callback.
 *
 * Returns 0 for unbound devices and for drivers without ->runtime_idle();
 * otherwise forwards the driver's return value.
 */
static int pci_pm_runtime_idle(struct device *dev)
{
	struct pci_dev *pci_dev = to_pci_dev(dev);
	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;

	/*
	 * If pci_dev->driver is not set (unbound), the device should
	 * always remain in D0 regardless of the runtime PM status.
	 */
	if (!pci_dev->driver)
		return 0;

	if (pm && pm->runtime_idle)
		return pm->runtime_idle(dev);

	return 0;
}

/* PM callback table for PCI devices; referenced through PCI_PM_OPS_PTR. */
static const struct dev_pm_ops pci_dev_pm_ops = {
	.prepare = pci_pm_prepare,
	.complete = pci_pm_complete,
	.suspend = pci_pm_suspend,
	.suspend_late = pci_pm_suspend_late,
	.resume = pci_pm_resume,
	.resume_early = pci_pm_resume_early,
	.freeze = pci_pm_freeze,
	.thaw = pci_pm_thaw,
	.poweroff = pci_pm_poweroff,
	.poweroff_late = pci_pm_poweroff_late,
	.restore = pci_pm_restore,
	.suspend_noirq = pci_pm_suspend_noirq,
	.resume_noirq = pci_pm_resume_noirq,
	.freeze_noirq = pci_pm_freeze_noirq,
	.thaw_noirq = pci_pm_thaw_noirq,
	.poweroff_noirq = pci_pm_poweroff_noirq,
	.restore_noirq = pci_pm_restore_noirq,
	.runtime_suspend = pci_pm_runtime_suspend,
	.runtime_resume = pci_pm_runtime_resume,
	.runtime_idle = pci_pm_runtime_idle,
};

#define PCI_PM_OPS_PTR	(&pci_dev_pm_ops)

#else /* !CONFIG_PM */

/* Without CONFIG_PM there is no callback table at all. */
#define pci_pm_runtime_suspend	NULL
#define pci_pm_runtime_resume	NULL
#define pci_pm_runtime_idle	NULL

#define PCI_PM_OPS_PTR	NULL

#endif /* !CONFIG_PM */

/**
 * __pci_register_driver - register a new pci driver
 * @drv: the driver structure to register
 * @owner: owner module of drv
 * @mod_name: module name string
 *
 * Adds the driver structure to the list of registered drivers.
 * Returns a negative value on error, otherwise 0.
 * If no error occurred, the driver remains registered even if
 * no device was claimed during registration.
*/ int __pci_register_driver(struct pci_driver *drv, struct module *owner, const char *mod_name) { /* initialize common driver fields */ drv->driver.name = drv->name; drv->driver.bus = &pci_bus_type; drv->driver.owner = owner; drv->driver.mod_name = mod_name; drv->driver.groups = drv->groups; drv->driver.dev_groups = drv->dev_groups; spin_lock_init(&drv->dynids.lock); INIT_LIST_HEAD(&drv->dynids.list); /* register with core */ return driver_register(&drv->driver); } EXPORT_SYMBOL(__pci_register_driver); /** * pci_unregister_driver - unregister a pci driver * @drv: the driver structure to unregister * * Deletes the driver structure from the list of registered PCI drivers, * gives it a chance to clean up by calling its remove() function for * each device it was responsible for, and marks those devices as * driverless. */ void pci_unregister_driver(struct pci_driver *drv) { driver_unregister(&drv->driver); pci_free_dynids(drv); } EXPORT_SYMBOL(pci_unregister_driver); static struct pci_driver pci_compat_driver = { .name = "compat" }; /** * pci_dev_driver - get the pci_driver of a device * @dev: the device to query * * Returns the appropriate pci_driver structure or %NULL if there is no * registered driver for the device. */ struct pci_driver *pci_dev_driver(const struct pci_dev *dev) { int i; if (dev->driver) return dev->driver; for (i = 0; i <= PCI_ROM_RESOURCE; i++) if (dev->resource[i].flags & IORESOURCE_BUSY) return &pci_compat_driver; return NULL; } EXPORT_SYMBOL(pci_dev_driver); /** * pci_bus_match - Tell if a PCI device structure has a matching PCI device id structure * @dev: the PCI device structure to match against * @drv: the device driver to search for matching PCI device id structures * * Used by a driver to check whether a PCI device present in the * system is in its list of supported devices. Returns the matching * pci_device_id structure or %NULL if there is no match. 
*/ static int pci_bus_match(struct device *dev, const struct device_driver *drv) { struct pci_dev *pci_dev = to_pci_dev(dev); struct pci_driver *pci_drv; const struct pci_device_id *found_id; if (!pci_dev->match_driver) return 0; pci_drv = (struct pci_driver *)to_pci_driver(drv); found_id = pci_match_device(pci_drv, pci_dev); if (found_id) return 1; return 0; } /** * pci_dev_get - increments the reference count of the pci device structure * @dev: the device being referenced * * Each live reference to a device should be refcounted. * * Drivers for PCI devices should normally record such references in * their probe() methods, when they bind to a device, and release * them by calling pci_dev_put(), in their disconnect() methods. * * A pointer to the device with the incremented reference counter is returned. */ struct pci_dev *pci_dev_get(struct pci_dev *dev) { if (dev) get_device(&dev->dev); return dev; } EXPORT_SYMBOL(pci_dev_get); /** * pci_dev_put - release a use of the pci device structure * @dev: device that's been disconnected * * Must be called when a user of a device is finished with it. When the last * user of the device calls this function, the memory of the device is freed. 
*/ void pci_dev_put(struct pci_dev *dev) { if (dev) put_device(&dev->dev); } EXPORT_SYMBOL(pci_dev_put); static int pci_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct pci_dev *pdev; if (!dev) return -ENODEV; pdev = to_pci_dev(dev); if (add_uevent_var(env, "PCI_CLASS=%04X", pdev->class)) return -ENOMEM; if (add_uevent_var(env, "PCI_ID=%04X:%04X", pdev->vendor, pdev->device)) return -ENOMEM; if (add_uevent_var(env, "PCI_SUBSYS_ID=%04X:%04X", pdev->subsystem_vendor, pdev->subsystem_device)) return -ENOMEM; if (add_uevent_var(env, "PCI_SLOT_NAME=%s", pci_name(pdev))) return -ENOMEM; if (add_uevent_var(env, "MODALIAS=pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02X", pdev->vendor, pdev->device, pdev->subsystem_vendor, pdev->subsystem_device, (u8)(pdev->class >> 16), (u8)(pdev->class >> 8), (u8)(pdev->class))) return -ENOMEM; return 0; } #if defined(CONFIG_PCIEAER) || defined(CONFIG_EEH) /** * pci_uevent_ers - emit a uevent during recovery path of PCI device * @pdev: PCI device undergoing error recovery * @err_type: type of error event */ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type) { int idx = 0; char *envp[3]; switch (err_type) { case PCI_ERS_RESULT_NONE: case PCI_ERS_RESULT_CAN_RECOVER: envp[idx++] = "ERROR_EVENT=BEGIN_RECOVERY"; envp[idx++] = "DEVICE_ONLINE=0"; break; case PCI_ERS_RESULT_RECOVERED: envp[idx++] = "ERROR_EVENT=SUCCESSFUL_RECOVERY"; envp[idx++] = "DEVICE_ONLINE=1"; break; case PCI_ERS_RESULT_DISCONNECT: envp[idx++] = "ERROR_EVENT=FAILED_RECOVERY"; envp[idx++] = "DEVICE_ONLINE=0"; break; default: break; } if (idx > 0) { envp[idx++] = NULL; kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, envp); } } #endif static int pci_bus_num_vf(struct device *dev) { return pci_num_vf(to_pci_dev(dev)); } /** * pci_dma_configure - Setup DMA configuration * @dev: ptr to dev structure * * Function to update PCI devices's DMA configuration using the same * info from the OF node or ACPI node of host bridge's parent (if 
any). */ static int pci_dma_configure(struct device *dev) { struct pci_driver *driver = to_pci_driver(dev->driver); struct device *bridge; int ret = 0; bridge = pci_get_host_bridge_device(to_pci_dev(dev)); if (IS_ENABLED(CONFIG_OF) && bridge->parent && bridge->parent->of_node) { ret = of_dma_configure(dev, bridge->parent->of_node, true); } else if (has_acpi_companion(bridge)) { struct acpi_device *adev = to_acpi_device_node(bridge->fwnode); ret = acpi_dma_configure(dev, acpi_get_dma_attr(adev)); } pci_put_host_bridge_device(bridge); if (!ret && !driver->driver_managed_dma) { ret = iommu_device_use_default_domain(dev); if (ret) arch_teardown_dma_ops(dev); } return ret; } static void pci_dma_cleanup(struct device *dev) { struct pci_driver *driver = to_pci_driver(dev->driver); if (!driver->driver_managed_dma) iommu_device_unuse_default_domain(dev); } struct bus_type pci_bus_type = { .name = "pci", .match = pci_bus_match, .uevent = pci_uevent, .probe = pci_device_probe, .remove = pci_device_remove, .shutdown = pci_device_shutdown, .dev_groups = pci_dev_groups, .bus_groups = pci_bus_groups, .drv_groups = pci_drv_groups, .pm = PCI_PM_OPS_PTR, .num_vf = pci_bus_num_vf, .dma_configure = pci_dma_configure, .dma_cleanup = pci_dma_cleanup, }; EXPORT_SYMBOL(pci_bus_type); #ifdef CONFIG_PCIEPORTBUS static int pcie_port_bus_match(struct device *dev, const struct device_driver *drv) { struct pcie_device *pciedev; const struct pcie_port_service_driver *driver; if (drv->bus != &pcie_port_bus_type || dev->bus != &pcie_port_bus_type) return 0; pciedev = to_pcie_device(dev); driver = to_service_driver(drv); if (driver->service != pciedev->service) return 0; if (driver->port_type != PCIE_ANY_PORT && driver->port_type != pci_pcie_type(pciedev->port)) return 0; return 1; } const struct bus_type pcie_port_bus_type = { .name = "pci_express", .match = pcie_port_bus_match, }; #endif static int __init pci_driver_init(void) { int ret; ret = bus_register(&pci_bus_type); if (ret) return ret; 
#ifdef CONFIG_PCIEPORTBUS ret = bus_register(&pcie_port_bus_type); if (ret) return ret; #endif dma_debug_add_bus(&pci_bus_type); return 0; } postcore_initcall(pci_driver_init); |
| 9 9 1 3 3 3 1 1 1 1 1 1 1 4 1 1 5 5 1 4 4 1 1 1 1 1 3 3 3 3 2 2 2 1 1 1 2 1 1 2 4 4 4 4 4 1 1 1 3 3 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 2 2 1 1 2 2 2 1 1 1 1 2 2 2 2 2 1 3 3 1 1 1 1 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 
980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 
1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 
1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 
2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 
2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 
2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 
3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 
3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 
4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 
4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 
4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 
5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 
5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 
6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 
6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 | // SPDX-License-Identifier: GPL-2.0-only /* binder.c * * Android IPC Subsystem * * Copyright (C) 2007-2008 Google, Inc. */ /* * Locking overview * * There are 3 main spinlocks which must be acquired in the * order shown: * * 1) proc->outer_lock : protects binder_ref * binder_proc_lock() and binder_proc_unlock() are * used to acq/rel. * 2) node->lock : protects most fields of binder_node. * binder_node_lock() and binder_node_unlock() are * used to acq/rel * 3) proc->inner_lock : protects the thread and node lists * (proc->threads, proc->waiting_threads, proc->nodes) * and all todo lists associated with the binder_proc * (proc->todo, thread->todo, proc->delivered_death and * node->async_todo), as well as thread->transaction_stack * binder_inner_proc_lock() and binder_inner_proc_unlock() * are used to acq/rel * * Any lock under procA must never be nested under any lock at the same * level or below on procB. * * Functions that require a lock held on entry indicate which lock * in the suffix of the function name: * * foo_olocked() : requires proc->outer_lock * foo_nlocked() : requires node->lock * foo_ilocked() : requires proc->inner_lock * foo_oilocked(): requires proc->outer_lock and proc->inner_lock * foo_nilocked(): requires node->lock and proc->inner_lock * ... 
*/

/* Prefix every pr_*() message emitted by this file with the module name. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/nsproxy.h>
#include <linux/poll.h>
#include <linux/debugfs.h>
#include <linux/rbtree.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/pid_namespace.h>
#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/ratelimit.h>
#include <linux/syscalls.h>
#include <linux/task_work.h>
#include <linux/sizes.h>
#include <linux/ktime.h>

#include <uapi/linux/android/binder.h>

#include <linux/cacheflush.h>

#include "binder_internal.h"
#include "binder_trace.h"

static HLIST_HEAD(binder_deferred_list);
static DEFINE_MUTEX(binder_deferred_lock);

static HLIST_HEAD(binder_devices);
static HLIST_HEAD(binder_procs);
static DEFINE_MUTEX(binder_procs_lock);

static HLIST_HEAD(binder_dead_nodes);
static DEFINE_SPINLOCK(binder_dead_nodes_lock);

static struct dentry *binder_debugfs_dir_entry_root;
static struct dentry *binder_debugfs_dir_entry_proc;
/* Counter handed out through atomic_inc_return() to label objects. */
static atomic_t binder_last_id;

static int proc_show(struct seq_file *m, void *unused);
DEFINE_SHOW_ATTRIBUTE(proc);

#define FORBIDDEN_MMAP_FLAGS                (VM_WRITE)

/*
 * Debug categories. Each value is a single bit that binder_debug()
 * tests against binder_debug_mask.
 */
enum {
	BINDER_DEBUG_USER_ERROR             = 1U << 0,
	BINDER_DEBUG_FAILED_TRANSACTION     = 1U << 1,
	BINDER_DEBUG_DEAD_TRANSACTION       = 1U << 2,
	BINDER_DEBUG_OPEN_CLOSE             = 1U << 3,
	BINDER_DEBUG_DEAD_BINDER            = 1U << 4,
	BINDER_DEBUG_DEATH_NOTIFICATION     = 1U << 5,
	BINDER_DEBUG_READ_WRITE             = 1U << 6,
	BINDER_DEBUG_USER_REFS              = 1U << 7,
	BINDER_DEBUG_THREADS                = 1U << 8,
	BINDER_DEBUG_TRANSACTION            = 1U << 9,
	BINDER_DEBUG_TRANSACTION_COMPLETE   = 1U << 10,
	BINDER_DEBUG_FREE_BUFFER            = 1U << 11,
	BINDER_DEBUG_INTERNAL_REFS          = 1U << 12,
	BINDER_DEBUG_PRIORITY_CAP           = 1U << 13,
	BINDER_DEBUG_SPINLOCKS              = 1U << 14,
};
/* Enabled by default: user errors plus failed and dead transactions. */
static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR |
	BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION;
module_param_named(debug_mask, binder_debug_mask, uint, 0644);

char *binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES;
module_param_named(devices, binder_devices_param, charp, 0444);

static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait);
static int binder_stop_on_user_error;

/*
 * Setter for the "stop_on_user_error" module parameter.
 *
 * Stores the new value through param_set_int() and, when the resulting
 * binder_stop_on_user_error is below 2, wakes anything sleeping on
 * binder_user_error_wait. Returns the param_set_int() result.
 */
static int binder_set_stop_on_user_error(const char *val,
					 const struct kernel_param *kp)
{
	int ret;

	ret = param_set_int(val, kp);
	if (binder_stop_on_user_error < 2)
		wake_up(&binder_user_error_wait);
	return ret;
}
module_param_call(stop_on_user_error, binder_set_stop_on_user_error,
	param_get_int, &binder_stop_on_user_error, 0644);

/*
 * Emit a rate-limited pr_info() message, but only when at least one bit
 * of @mask is set in binder_debug_mask. @format and the variadic
 * arguments are forwarded through a struct va_format / "%pV".
 */
static __printf(2, 3) void binder_debug(int mask, const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	if (binder_debug_mask & mask) {
		va_start(args, format);
		vaf.va = &args;
		vaf.fmt = format;
		pr_info_ratelimited("%pV", &vaf);
		va_end(args);
	}
}

/* Shorthand for binder_debug() in the FAILED_TRANSACTION category. */
#define binder_txn_error(x...) \
	binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, x)

/*
 * Report an error attributed to userspace.
 *
 * The message is printed (rate-limited) only when BINDER_DEBUG_USER_ERROR
 * is enabled in binder_debug_mask. Independently of that, a non-zero
 * binder_stop_on_user_error is raised to 2.
 */
static __printf(1, 2) void binder_user_error(const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	if (binder_debug_mask & BINDER_DEBUG_USER_ERROR) {
		va_start(args, format);
		vaf.va = &args;
		vaf.fmt = format;
		pr_info_ratelimited("%pV", &vaf);
		va_end(args);
	}

	if (binder_stop_on_user_error)
		binder_stop_on_user_error = 2;
}

/* Fill the id/command/param triple of the extended-error record @ee. */
#define binder_set_extended_error(ee, _id, _command, _param) \
	do { \
		(ee)->id = _id; \
		(ee)->command = _command; \
		(ee)->param = _param; \
	} while (0)

/*
 * Convert a pointer to the embedded "hdr" member back into a pointer to
 * the specific object type that contains it.
 */
#define to_flat_binder_object(hdr) \
	container_of(hdr, struct flat_binder_object, hdr)

#define to_binder_fd_object(hdr) container_of(hdr, struct binder_fd_object, hdr)

#define to_binder_buffer_object(hdr) \
	container_of(hdr, struct binder_buffer_object, hdr)

#define to_binder_fd_array_object(hdr) \
	container_of(hdr, struct binder_fd_array_object, hdr)

static struct binder_stats binder_stats;

/* Count one deleted object of kind @type in the global statistics. */
static inline void binder_stats_deleted(enum binder_stat_types type)
{
	atomic_inc(&binder_stats.obj_deleted[type]);
}

/* Count one created object of kind @type in the global statistics. */
static inline void binder_stats_created(enum binder_stat_types type)
{
	atomic_inc(&binder_stats.obj_created[type]);
}

/* One record of the transaction log kept in struct binder_transaction_log. */
struct binder_transaction_log_entry {
	int debug_id;
	int debug_id_done;
	int call_type;
	int from_proc;
	int from_thread;
	int target_handle;
	int to_proc;
	int to_thread;
	int to_node;
	int data_size;
	int offsets_size;
	int return_error_line;
	uint32_t return_error;
	uint32_t return_error_param;
	char context_name[BINDERFS_MAX_NAME + 1];
};

/*
 * Fixed-size log of 32 entries. @cur is a running counter used modulo the
 * array size; @full is set once the counter has reached the array size.
 */
struct binder_transaction_log {
	atomic_t cur;
	bool full;
	struct binder_transaction_log_entry entry[32];
};

static struct binder_transaction_log binder_transaction_log;
static struct binder_transaction_log binder_transaction_log_failed;

/*
 * Claim the next slot of @log and return it zeroed.
 *
 * The counter is advanced with atomic_inc_return() and the slot index is
 * the new counter value modulo the number of entries. log->full is set
 * once the counter is at least the number of entries.
 */
static struct binder_transaction_log_entry *binder_transaction_log_add(
	struct binder_transaction_log *log)
{
	struct binder_transaction_log_entry *e;
	unsigned int cur = atomic_inc_return(&log->cur);

	if (cur >= ARRAY_SIZE(log->entry))
		log->full = true;
	e = &log->entry[cur % ARRAY_SIZE(log->entry)];
	WRITE_ONCE(e->debug_id_done, 0);
	/*
	 * write-barrier to synchronize access to e->debug_id_done.
	 * We make sure the initialized 0 value is seen before
	 * the other fields are zeroed by memset().
	 */
	smp_wmb();
	memset(e, 0, sizeof(*e));
	return e;
}

/* Bit flags naming the kinds of deferred work. */
enum binder_deferred_state {
	BINDER_DEFERRED_FLUSH        = 0x01,
	BINDER_DEFERRED_RELEASE      = 0x02,
};

/* Bit flags stored in a thread's looper field. */
enum {
	BINDER_LOOPER_STATE_REGISTERED  = 0x01,
	BINDER_LOOPER_STATE_ENTERED     = 0x02,
	BINDER_LOOPER_STATE_EXITED      = 0x04,
	BINDER_LOOPER_STATE_INVALID     = 0x08,
	BINDER_LOOPER_STATE_WAITING     = 0x10,
	BINDER_LOOPER_STATE_POLL        = 0x20,
};

/**
 * binder_proc_lock() - Acquire outer lock for given binder_proc
 * @proc:         struct binder_proc to acquire
 *
 * Acquires proc->outer_lock. Used to protect binder_ref
 * structures associated with the given proc.
 *
 * The macro passes the caller's __LINE__ so that the
 * BINDER_DEBUG_SPINLOCKS trace identifies the call site.
 */
#define binder_proc_lock(proc) _binder_proc_lock(proc, __LINE__)
static void
_binder_proc_lock(struct binder_proc *proc, int line)
	__acquires(&proc->outer_lock)
{
	binder_debug(BINDER_DEBUG_SPINLOCKS,
		     "%s: line=%d\n", __func__, line);
	spin_lock(&proc->outer_lock);
}

/**
 * binder_proc_unlock() - Release spinlock for given binder_proc
 * @proc:         struct binder_proc to release
 *
 * Release lock acquired via binder_proc_lock()
 */
#define binder_proc_unlock(proc) _binder_proc_unlock(proc, __LINE__)
static void
_binder_proc_unlock(struct binder_proc *proc, int line)
	__releases(&proc->outer_lock)
{
	binder_debug(BINDER_DEBUG_SPINLOCKS,
		     "%s: line=%d\n", __func__, line);
	spin_unlock(&proc->outer_lock);
}

/**
 * binder_inner_proc_lock() - Acquire inner lock for given binder_proc
 * @proc:         struct binder_proc to acquire
 *
 * Acquires proc->inner_lock.
Used to protect todo lists
 */
#define binder_inner_proc_lock(proc) _binder_inner_proc_lock(proc, __LINE__)
static void
_binder_inner_proc_lock(struct binder_proc *proc, int line)
	__acquires(&proc->inner_lock)
{
	/* Trace the call site (only printed with BINDER_DEBUG_SPINLOCKS). */
	binder_debug(BINDER_DEBUG_SPINLOCKS,
		     "%s: line=%d\n", __func__, line);
	spin_lock(&proc->inner_lock);
}

/**
 * binder_inner_proc_unlock() - Release inner lock for given binder_proc
 * @proc:         struct binder_proc to release
 *
 * Release lock acquired via binder_inner_proc_lock()
 */
#define binder_inner_proc_unlock(proc) _binder_inner_proc_unlock(proc, __LINE__)
static void
_binder_inner_proc_unlock(struct binder_proc *proc, int line)
	__releases(&proc->inner_lock)
{
	binder_debug(BINDER_DEBUG_SPINLOCKS,
		     "%s: line=%d\n", __func__, line);
	spin_unlock(&proc->inner_lock);
}

/**
 * binder_node_lock() - Acquire spinlock for given binder_node
 * @node:         struct binder_node to acquire
 *
 * Acquires node->lock. Used to protect binder_node fields
 */
#define binder_node_lock(node) _binder_node_lock(node, __LINE__)
static void
_binder_node_lock(struct binder_node *node, int line)
	__acquires(&node->lock)
{
	binder_debug(BINDER_DEBUG_SPINLOCKS,
		     "%s: line=%d\n", __func__, line);
	spin_lock(&node->lock);
}

/**
 * binder_node_unlock() - Release spinlock for given binder_node
 * @node:         struct binder_node to release
 *
 * Release lock acquired via binder_node_lock()
 */
#define binder_node_unlock(node) _binder_node_unlock(node, __LINE__)
static void
_binder_node_unlock(struct binder_node *node, int line)
	__releases(&node->lock)
{
	binder_debug(BINDER_DEBUG_SPINLOCKS,
		     "%s: line=%d\n", __func__, line);
	spin_unlock(&node->lock);
}

/**
 * binder_node_inner_lock() - Acquire node and inner locks
 * @node:         struct binder_node to acquire
 *
 * Acquires node->lock. If node->proc also acquires
 * proc->inner_lock.
Used to protect binder_node fields */ #define binder_node_inner_lock(node) _binder_node_inner_lock(node, __LINE__) static void _binder_node_inner_lock(struct binder_node *node, int line) __acquires(&node->lock) __acquires(&node->proc->inner_lock) { binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); spin_lock(&node->lock); if (node->proc) binder_inner_proc_lock(node->proc); else /* annotation for sparse */ __acquire(&node->proc->inner_lock); } /** * binder_node_inner_unlock() - Release node and inner locks * @node: struct binder_node to acquire * * Release lock acquired via binder_node_lock() */ #define binder_node_inner_unlock(node) _binder_node_inner_unlock(node, __LINE__) static void _binder_node_inner_unlock(struct binder_node *node, int line) __releases(&node->lock) __releases(&node->proc->inner_lock) { struct binder_proc *proc = node->proc; binder_debug(BINDER_DEBUG_SPINLOCKS, "%s: line=%d\n", __func__, line); if (proc) binder_inner_proc_unlock(proc); else /* annotation for sparse */ __release(&node->proc->inner_lock); spin_unlock(&node->lock); } static bool binder_worklist_empty_ilocked(struct list_head *list) { return list_empty(list); } /** * binder_worklist_empty() - Check if no items on the work list * @proc: binder_proc associated with list * @list: list to check * * Return: true if there are no items on list, else false */ static bool binder_worklist_empty(struct binder_proc *proc, struct list_head *list) { bool ret; binder_inner_proc_lock(proc); ret = binder_worklist_empty_ilocked(list); binder_inner_proc_unlock(proc); return ret; } /** * binder_enqueue_work_ilocked() - Add an item to the work list * @work: struct binder_work to add to list * @target_list: list to add work to * * Adds the work to the specified list. Asserts that work * is not already on a list. * * Requires the proc->inner_lock to be held. 
*/ static void binder_enqueue_work_ilocked(struct binder_work *work, struct list_head *target_list) { BUG_ON(target_list == NULL); BUG_ON(work->entry.next && !list_empty(&work->entry)); list_add_tail(&work->entry, target_list); } /** * binder_enqueue_deferred_thread_work_ilocked() - Add deferred thread work * @thread: thread to queue work to * @work: struct binder_work to add to list * * Adds the work to the todo list of the thread. Doesn't set the process_todo * flag, which means that (if it wasn't already set) the thread will go to * sleep without handling this work when it calls read. * * Requires the proc->inner_lock to be held. */ static void binder_enqueue_deferred_thread_work_ilocked(struct binder_thread *thread, struct binder_work *work) { WARN_ON(!list_empty(&thread->waiting_thread_node)); binder_enqueue_work_ilocked(work, &thread->todo); } /** * binder_enqueue_thread_work_ilocked() - Add an item to the thread work list * @thread: thread to queue work to * @work: struct binder_work to add to list * * Adds the work to the todo list of the thread, and enables processing * of the todo queue. * * Requires the proc->inner_lock to be held. */ static void binder_enqueue_thread_work_ilocked(struct binder_thread *thread, struct binder_work *work) { WARN_ON(!list_empty(&thread->waiting_thread_node)); binder_enqueue_work_ilocked(work, &thread->todo); /* (e)poll-based threads require an explicit wakeup signal when * queuing their own work; they rely on these events to consume * messages without I/O block. Without it, threads risk waiting * indefinitely without handling the work. 
*/ if (thread->looper & BINDER_LOOPER_STATE_POLL && thread->pid == current->pid && !thread->process_todo) wake_up_interruptible_sync(&thread->wait); thread->process_todo = true; } /** * binder_enqueue_thread_work() - Add an item to the thread work list * @thread: thread to queue work to * @work: struct binder_work to add to list * * Adds the work to the todo list of the thread, and enables processing * of the todo queue. */ static void binder_enqueue_thread_work(struct binder_thread *thread, struct binder_work *work) { binder_inner_proc_lock(thread->proc); binder_enqueue_thread_work_ilocked(thread, work); binder_inner_proc_unlock(thread->proc); } static void binder_dequeue_work_ilocked(struct binder_work *work) { list_del_init(&work->entry); } /** * binder_dequeue_work() - Removes an item from the work list * @proc: binder_proc associated with list * @work: struct binder_work to remove from list * * Removes the specified work item from whatever list it is on. * Can safely be called if work is not on any list. 
*/ static void binder_dequeue_work(struct binder_proc *proc, struct binder_work *work) { binder_inner_proc_lock(proc); binder_dequeue_work_ilocked(work); binder_inner_proc_unlock(proc); } static struct binder_work *binder_dequeue_work_head_ilocked( struct list_head *list) { struct binder_work *w; w = list_first_entry_or_null(list, struct binder_work, entry); if (w) list_del_init(&w->entry); return w; } static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); static void binder_free_thread(struct binder_thread *thread); static void binder_free_proc(struct binder_proc *proc); static void binder_inc_node_tmpref_ilocked(struct binder_node *node); static bool binder_has_work_ilocked(struct binder_thread *thread, bool do_proc_work) { return thread->process_todo || thread->looper_need_return || (do_proc_work && !binder_worklist_empty_ilocked(&thread->proc->todo)); } static bool binder_has_work(struct binder_thread *thread, bool do_proc_work) { bool has_work; binder_inner_proc_lock(thread->proc); has_work = binder_has_work_ilocked(thread, do_proc_work); binder_inner_proc_unlock(thread->proc); return has_work; } static bool binder_available_for_proc_work_ilocked(struct binder_thread *thread) { return !thread->transaction_stack && binder_worklist_empty_ilocked(&thread->todo); } static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc, bool sync) { struct rb_node *n; struct binder_thread *thread; for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { thread = rb_entry(n, struct binder_thread, rb_node); if (thread->looper & BINDER_LOOPER_STATE_POLL && binder_available_for_proc_work_ilocked(thread)) { if (sync) wake_up_interruptible_sync(&thread->wait); else wake_up_interruptible(&thread->wait); } } } /** * binder_select_thread_ilocked() - selects a thread for doing proc work. 
* @proc:	process to select a thread from
 *
 * Note that calling this function moves the thread off the waiting_threads
 * list, so it can only be woken up by the caller of this function, or a
 * signal. Therefore, callers *should* always wake up the thread this function
 * returns.
 *
 * Return:	If there's a thread currently waiting for process work,
 *		returns that thread. Otherwise returns NULL.
 */
static struct binder_thread *
binder_select_thread_ilocked(struct binder_proc *proc)
{
	struct binder_thread *thread;

	assert_spin_locked(&proc->inner_lock);
	/* First entry of proc->waiting_threads, or NULL if the list is empty. */
	thread = list_first_entry_or_null(&proc->waiting_threads,
					  struct binder_thread,
					  waiting_thread_node);

	if (thread)
		list_del_init(&thread->waiting_thread_node);

	return thread;
}

/**
 * binder_wakeup_thread_ilocked() - wakes up a thread for doing proc work.
 * @proc:	process to wake up a thread in
 * @thread:	specific thread to wake-up (may be NULL)
 * @sync:	whether to do a synchronous wake-up
 *
 * This function wakes up a thread in the @proc process.
 * The caller may provide a specific thread to wake-up in
 * the @thread parameter. If @thread is NULL, this function
 * will wake up threads that have called poll().
 *
 * Note that for this function to work as expected, callers
 * should first call binder_select_thread_ilocked() to find a
 * thread to handle the work (if they don't have a thread already),
 * and pass the result into the @thread parameter.
 */
static void binder_wakeup_thread_ilocked(struct binder_proc *proc,
					 struct binder_thread *thread,
					 bool sync)
{
	assert_spin_locked(&proc->inner_lock);

	if (thread) {
		if (sync)
			wake_up_interruptible_sync(&thread->wait);
		else
			wake_up_interruptible(&thread->wait);
		return;
	}

	/* Didn't find a thread waiting for proc work; this can happen
	 * in two scenarios:
	 * 1. All threads are busy handling transactions
	 *    In that case, one of those threads should call back into
	 *    the kernel driver soon and pick up this work.
	 * 2. Threads are using the (e)poll interface, in which case
	 *    they may be blocked on the waitqueue without having been
	 *    added to waiting_threads. For this case, we just iterate
	 *    over all threads not handling transaction work, and
	 *    wake them all up. We wake all because we don't know whether
	 *    a thread that called into (e)poll is handling non-binder
	 *    work currently.
	 */
	binder_wakeup_poll_threads_ilocked(proc, sync);
}

/*
 * Pick a waiting thread of @proc (possibly none) and perform a
 * non-synchronous wake-up through binder_wakeup_thread_ilocked().
 */
static void binder_wakeup_proc_ilocked(struct binder_proc *proc)
{
	struct binder_thread *thread = binder_select_thread_ilocked(proc);

	binder_wakeup_thread_ilocked(proc, thread, /* sync = */false);
}

/*
 * Set the nice value of the current task.
 *
 * @nice is applied directly when can_nice() permits it. Otherwise the
 * value derived from RLIMIT_NICE is applied instead, and a user error is
 * reported if that derived value is greater than MAX_NICE.
 */
static void binder_set_nice(long nice)
{
	long min_nice;

	if (can_nice(current, nice)) {
		set_user_nice(current, nice);
		return;
	}
	/* Not allowed: fall back to the limit granted by RLIMIT_NICE. */
	min_nice = rlimit_to_nice(rlimit(RLIMIT_NICE));
	binder_debug(BINDER_DEBUG_PRIORITY_CAP,
		     "%d: nice value %ld not allowed use %ld instead\n",
		      current->pid, nice, min_nice);
	set_user_nice(current, min_nice);
	if (min_nice <= MAX_NICE)
		return;
	binder_user_error("%d RLIMIT_NICE not set\n", current->pid);
}

/*
 * Look up the node of @proc whose ptr equals @ptr in the proc->nodes
 * rb-tree (keyed by node->ptr). On a match a temporary reference is taken
 * before the node is returned; NULL is returned when nothing matches.
 * Requires proc->inner_lock.
 */
static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc,
						   binder_uintptr_t ptr)
{
	struct rb_node *n = proc->nodes.rb_node;
	struct binder_node *node;

	assert_spin_locked(&proc->inner_lock);

	while (n) {
		node = rb_entry(n, struct binder_node, rb_node);

		if (ptr < node->ptr)
			n = n->rb_left;
		else if (ptr > node->ptr)
			n = n->rb_right;
		else {
			/*
			 * take an implicit weak reference
			 * to ensure node stays alive until
			 * call to binder_put_node()
			 */
			binder_inc_node_tmpref_ilocked(node);
			return node;
		}
	}
	return NULL;
}

/* Locked wrapper around binder_get_node_ilocked(). */
static struct binder_node *binder_get_node(struct binder_proc *proc,
					   binder_uintptr_t ptr)
{
	struct binder_node *node;

	binder_inner_proc_lock(proc);
	node = binder_get_node_ilocked(proc, ptr);
	binder_inner_proc_unlock(proc);
	return node;
}

/*
 * Insert @new_node into proc->nodes, keyed by the binder pointer taken
 * from @fp (0 when @fp is NULL).
 *
 * If a node with the same ptr is already in the tree, @new_node is left
 * untouched and the existing node is returned instead. In both cases the
 * returned node has had its tmp_refs incremented. Requires
 * proc->inner_lock.
 */
static struct binder_node *binder_init_node_ilocked(
						struct binder_proc *proc,
						struct binder_node *new_node,
						struct flat_binder_object *fp)
{
	struct rb_node **p = &proc->nodes.rb_node;
	struct rb_node *parent = NULL;
	struct binder_node *node;
	binder_uintptr_t ptr = fp ? fp->binder : 0;
	binder_uintptr_t cookie = fp ? fp->cookie : 0;
	__u32 flags = fp ? fp->flags : 0;

	assert_spin_locked(&proc->inner_lock);

	/* Find the insertion point, bailing out on an existing match. */
	while (*p) {

		parent = *p;
		node = rb_entry(parent, struct binder_node, rb_node);

		if (ptr < node->ptr)
			p = &(*p)->rb_left;
		else if (ptr > node->ptr)
			p = &(*p)->rb_right;
		else {
			/*
			 * A matching node is already in
			 * the rb tree. Abandon the init
			 * and return it.
			 */
			binder_inc_node_tmpref_ilocked(node);
			return node;
		}
	}
	node = new_node;
	binder_stats_created(BINDER_STAT_NODE);
	node->tmp_refs++;
	rb_link_node(&node->rb_node, parent, p);
	rb_insert_color(&node->rb_node, &proc->nodes);
	node->debug_id = atomic_inc_return(&binder_last_id);
	node->proc = proc;
	node->ptr = ptr;
	node->cookie = cookie;
	node->work.type = BINDER_WORK_NODE;
	/* Decode the per-node settings carried in the flat object's flags. */
	node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK;
	node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS);
	node->txn_security_ctx = !!(flags & FLAT_BINDER_FLAG_TXN_SECURITY_CTX);
	spin_lock_init(&node->lock);
	INIT_LIST_HEAD(&node->work.entry);
	INIT_LIST_HEAD(&node->async_todo);
	binder_debug(BINDER_DEBUG_INTERNAL_REFS,
		     "%d:%d node %d u%016llx c%016llx created\n",
		     proc->pid, current->pid, node->debug_id,
		     (u64)node->ptr, (u64)node->cookie);

	return node;
}

/*
 * Allocate a zeroed node and register it with @proc.
 *
 * The allocation happens before the inner lock is taken. If the tree
 * already held a node for the same ptr, the fresh allocation is freed and
 * the existing node is returned. Returns NULL only on allocation failure.
 */
static struct binder_node *binder_new_node(struct binder_proc *proc,
					   struct flat_binder_object *fp)
{
	struct binder_node *node;
	struct binder_node *new_node = kzalloc(sizeof(*node), GFP_KERNEL);

	if (!new_node)
		return NULL;
	binder_inner_proc_lock(proc);
	node = binder_init_node_ilocked(proc, new_node, fp);
	binder_inner_proc_unlock(proc);
	if (node != new_node)
		/*
		 * The node was already added by another thread
		 */
		kfree(new_node);

	return node;
}

/* Free @node and account for the deletion in the global statistics. */
static void binder_free_node(struct binder_node *node)
{
	kfree(node);
	binder_stats_deleted(BINDER_STAT_NODE);
}

/*
 * Take a reference on @node.
 *
 * @strong selects the strong (non-zero) or weak (zero) counters and
 * @internal selects the internal or local counter; a weak internal
 * increment changes no counter here. @target_list, when given, is the
 * list the node's work item is queued on if the corresponding has_*_ref
 * flag is not yet set.
 *
 * Requires node->lock and, when the node has a proc, proc->inner_lock.
 *
 * Return: 0 on success, -EINVAL when the increment is rejected.
 */
static int binder_inc_node_nilocked(struct binder_node *node, int strong,
				    int internal,
				    struct list_head *target_list)
{
	struct binder_proc *proc = node->proc;

	assert_spin_locked(&node->lock);
	if (proc)
		assert_spin_locked(&proc->inner_lock);
	if (strong) {
		if (internal) {
			/*
			 * A first internal strong ref without a target list
			 * is only accepted for the context manager node that
			 * already has a strong ref.
			 */
			if (target_list == NULL &&
			    node->internal_strong_refs == 0 &&
			    !(node->proc &&
			      node == node->proc->context->binder_context_mgr_node &&
			      node->has_strong_ref)) {
				pr_err("invalid inc strong node for %d\n",
					node->debug_id);
				return -EINVAL;
			}
			node->internal_strong_refs++;
		} else
			node->local_strong_refs++;
		if (!node->has_strong_ref && target_list) {
			/* target_list is treated as a binder_thread's todo list */
			struct binder_thread *thread = container_of(target_list,
						    struct binder_thread, todo);
			binder_dequeue_work_ilocked(&node->work);
			BUG_ON(&thread->todo != target_list);
			binder_enqueue_deferred_thread_work_ilocked(thread,
								   &node->work);
		}
	} else {
		if (!internal)
			node->local_weak_refs++;
		if (!node->has_weak_ref && list_empty(&node->work.entry)) {
			if (target_list == NULL) {
				pr_err("invalid inc weak node for %d\n",
					node->debug_id);
				return -EINVAL;
			}
			/*
			 * Queue the node work on the caller-supplied list;
			 * this plain enqueue does not set any process_todo
			 * flag.
			 */
			binder_enqueue_work_ilocked(&node->work, target_list);
		}
	}
	return 0;
}

/* Locked wrapper around binder_inc_node_nilocked(). */
static int binder_inc_node(struct binder_node *node, int strong, int internal,
			   struct list_head *target_list)
{
	int ret;

	binder_node_inner_lock(node);
	ret = binder_inc_node_nilocked(node, strong, internal, target_list);
	binder_node_inner_unlock(node);

	return ret;
}

/*
 * Drop a reference on @node.
 *
 * @strong / @internal select the counter exactly as in
 * binder_inc_node_nilocked(). When references of the selected kind
 * remain, nothing else happens. Otherwise the node's work item is
 * queued on proc->todo (if the node has a proc and a has_*_ref flag is
 * still set), or the node is unlinked once no references of any kind are
 * left.
 *
 * Requires node->lock and, when the node has a proc, proc->inner_lock.
 *
 * Return: true when the node has been unlinked and must be freed by the
 * caller, false otherwise.
 */
static bool binder_dec_node_nilocked(struct binder_node *node,
				     int strong, int internal)
{
	struct binder_proc *proc = node->proc;

	assert_spin_locked(&node->lock);
	if (proc)
		assert_spin_locked(&proc->inner_lock);
	if (strong) {
		if (internal)
			node->internal_strong_refs--;
		else
			node->local_strong_refs--;
		if (node->local_strong_refs || node->internal_strong_refs)
			return false;
	} else {
		if (!internal)
			node->local_weak_refs--;
		if (node->local_weak_refs || node->tmp_refs ||
				!hlist_empty(&node->refs))
			return false;
	}

	if (proc && (node->has_strong_ref || node->has_weak_ref)) {
		/* Queue the node work on the proc (once) and wake a thread. */
		if (list_empty(&node->work.entry)) {
			binder_enqueue_work_ilocked(&node->work, &proc->todo);
			binder_wakeup_proc_ilocked(proc);
		}
	} else {
		if (hlist_empty(&node->refs) && !node->local_strong_refs &&
		    !node->local_weak_refs && !node->tmp_refs) {
			if (proc) {
				/* Live node: drop pending work and leave the tree. */
				binder_dequeue_work_ilocked(&node->work);
				rb_erase(&node->rb_node, &proc->nodes);
				binder_debug(BINDER_DEBUG_INTERNAL_REFS,
					     "refless node %d deleted\n",
					     node->debug_id);
			} else {
				BUG_ON(!list_empty(&node->work.entry));
				spin_lock(&binder_dead_nodes_lock);
				/*
				 * tmp_refs could have changed so
				 * check it again
				 */
				if (node->tmp_refs) {
					spin_unlock(&binder_dead_nodes_lock);
					return false;
				}
				hlist_del(&node->dead_node);
				spin_unlock(&binder_dead_nodes_lock);
				binder_debug(BINDER_DEBUG_INTERNAL_REFS,
					     "dead node %d deleted\n",
					     node->debug_id);
			}
			return true;
		}
	}
	return false;
}

/*
 * Locked wrapper around binder_dec_node_nilocked(); the node is freed
 * after the locks have been dropped when the helper asks for it.
 */
static void binder_dec_node(struct binder_node *node, int strong, int internal)
{
	bool free_node;

	binder_node_inner_lock(node);
	free_node = binder_dec_node_nilocked(node, strong, internal);
	binder_node_inner_unlock(node);
	if (free_node)
		binder_free_node(node);
}

/* Bump node->tmp_refs; the caller provides the locking. */
static void binder_inc_node_tmpref_ilocked(struct binder_node *node)
{
	/*
	 * No call to binder_inc_node() is needed since we
	 * don't need to inform userspace of any changes to
	 * tmp_refs
	 */
	node->tmp_refs++;
}

/**
 * binder_inc_node_tmpref() - take a temporary reference on node
 * @node:	node to reference
 *
 * Take reference on node to prevent the node from being freed
 * while referenced only by a local variable. The inner lock is
 * needed to serialize with the node work on the queue (which
 * isn't needed after the node is dead).
If the node is dead
 * (node->proc is NULL), use binder_dead_nodes_lock to protect
 * node->tmp_refs against dead-node-only cases where the node
 * lock cannot be acquired (eg traversing the dead node list to
 * print nodes)
 */
static void binder_inc_node_tmpref(struct binder_node *node)
{
	binder_node_lock(node);
	/* Live node: proc inner lock. Dead node: global dead-nodes lock. */
	if (node->proc)
		binder_inner_proc_lock(node->proc);
	else
		spin_lock(&binder_dead_nodes_lock);
	binder_inc_node_tmpref_ilocked(node);
	if (node->proc)
		binder_inner_proc_unlock(node->proc);
	else
		spin_unlock(&binder_dead_nodes_lock);
	binder_node_unlock(node);
}

/**
 * binder_dec_node_tmpref() - remove a temporary reference on node
 * @node:	node to reference
 *
 * Release temporary reference on node taken via binder_inc_node_tmpref()
 */
static void binder_dec_node_tmpref(struct binder_node *node)
{
	bool free_node;

	binder_node_inner_lock(node);
	/*
	 * Only a dead node takes binder_dead_nodes_lock for real; the
	 * __acquire()/__release() calls on the other path are lock
	 * annotations that keep both branches balanced.
	 */
	if (!node->proc)
		spin_lock(&binder_dead_nodes_lock);
	else
		__acquire(&binder_dead_nodes_lock);
	node->tmp_refs--;
	BUG_ON(node->tmp_refs < 0);
	if (!node->proc)
		spin_unlock(&binder_dead_nodes_lock);
	else
		__release(&binder_dead_nodes_lock);
	/*
	 * Call binder_dec_node() to check if all refcounts are 0
	 * and cleanup is needed. Calling with strong=0 and internal=1
	 * causes no actual reference to be released in binder_dec_node().
	 * If that changes, a change is needed here too.
	 */
	free_node = binder_dec_node_nilocked(node, 0, 1);
	binder_node_inner_unlock(node);
	if (free_node)
		binder_free_node(node);
}

/* Drop a temporary node reference; thin alias of binder_dec_node_tmpref(). */
static void binder_put_node(struct binder_node *node)
{
	binder_dec_node_tmpref(node);
}

/*
 * Look up the ref with descriptor @desc in the proc->refs_by_desc
 * rb-tree. Returns the ref, or NULL when no ref has that descriptor.
 * When @need_strong_ref is set and the matching ref has a zero strong
 * count, a user error is logged and NULL is returned.
 */
static struct binder_ref *binder_get_ref_olocked(struct binder_proc *proc,
						 u32 desc, bool need_strong_ref)
{
	struct rb_node *n = proc->refs_by_desc.rb_node;
	struct binder_ref *ref;

	while (n) {
		ref = rb_entry(n, struct binder_ref, rb_node_desc);

		if (desc < ref->data.desc) {
			n = n->rb_left;
		} else if (desc > ref->data.desc) {
			n = n->rb_right;
		} else if (need_strong_ref && !ref->data.strong) {
			binder_user_error("tried to use weak ref as strong ref\n");
			return NULL;
		} else {
			return ref;
		}
	}
	return NULL;
}

/*
 * Find the smallest unused descriptor the "slow way": walk the refs in
 * ascending descriptor order starting the candidate at @offset, and stop
 * at the first ref whose descriptor is above the candidate.
 */
static u32 slow_desc_lookup_olocked(struct binder_proc *proc, u32 offset)
{
	struct binder_ref *ref;
	struct rb_node *n;
	u32 desc;

	desc = offset;
	for (n = rb_first(&proc->refs_by_desc); n; n = rb_next(n)) {
		ref = rb_entry(n, struct binder_ref, rb_node_desc);
		if (ref->data.desc > desc)
			break;
		desc = ref->data.desc + 1;
	}

	return desc;
}

/*
 * Find an available reference descriptor ID. The proc->outer_lock might
 * be released in the process, in which case -EAGAIN is returned and the
 * @desc should be considered invalid.
 */
static int get_ref_desc_olocked(struct binder_proc *proc,
				struct binder_node *node,
				u32 *desc)
{
	struct dbitmap *dmap = &proc->dmap;
	unsigned int nbits, offset;
	unsigned long *new, bit;

	/* 0 is reserved for the context manager */
	offset = (node == proc->context->binder_context_mgr_node) ? 0 : 1;

	/* No usable bitmap: fall back to scanning the rb-tree */
	if (!dbitmap_enabled(dmap)) {
		*desc = slow_desc_lookup_olocked(proc, offset);
		return 0;
	}

	if (dbitmap_acquire_next_zero_bit(dmap, offset, &bit) == 0) {
		*desc = bit;
		return 0;
	}

	/*
	 * The dbitmap is full and needs to grow. The proc->outer_lock
	 * is briefly released to allocate the new bitmap safely.
	 */
	nbits = dbitmap_grow_nbits(dmap);
	binder_proc_unlock(proc);
	new = bitmap_zalloc(nbits, GFP_KERNEL);
	binder_proc_lock(proc);
	/* NOTE(review): @new is passed on unchecked; presumably dbitmap_grow() copes with NULL - confirm */
	dbitmap_grow(dmap, new, nbits);

	return -EAGAIN;
}

/**
 * binder_get_ref_for_node_olocked() - get the ref associated with given node
 * @proc:	binder_proc that owns the ref
 * @node:	binder_node of target
 * @new_ref:	newly allocated binder_ref to be initialized or %NULL
 *
 * Look up the ref for the given node and return it if it exists
 *
 * If it doesn't exist and the caller provides a newly allocated
 * ref, initialize the fields of the newly allocated ref and insert
 * into the given proc rb_trees and node refs list.
 *
 * Return:	the ref for node. It is possible that another thread
 *		allocated/initialized the ref first in which case the
 *		returned ref would be different than the passed-in
 *		new_ref. new_ref must be kfree'd by the caller in
 *		this case.
 */
static struct binder_ref *binder_get_ref_for_node_olocked(
					struct binder_proc *proc,
					struct binder_node *node,
					struct binder_ref *new_ref)
{
	struct binder_ref *ref;
	struct rb_node *parent;
	struct rb_node **p;
	u32 desc;

retry:
	/* Search refs_by_node; the tree is ordered by node pointer value */
	p = &proc->refs_by_node.rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;
		ref = rb_entry(parent, struct binder_ref, rb_node_node);

		if (node < ref->node)
			p = &(*p)->rb_left;
		else if (node > ref->node)
			p = &(*p)->rb_right;
		else
			return ref;
	}
	if (!new_ref)
		return NULL;

	/* might release the proc->outer_lock */
	if (get_ref_desc_olocked(proc, node, &desc) == -EAGAIN)
		goto retry;

	binder_stats_created(BINDER_STAT_REF);
	new_ref->data.debug_id = atomic_inc_return(&binder_last_id);
	new_ref->proc = proc;
	new_ref->node = node;
	rb_link_node(&new_ref->rb_node_node, parent, p);
	rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node);

	new_ref->data.desc = desc;
	/* Insert into refs_by_desc; a duplicate descriptor is a bug */
	p = &proc->refs_by_desc.rb_node;
	while (*p) {
		parent = *p;
		ref = rb_entry(parent, struct binder_ref, rb_node_desc);

		if (new_ref->data.desc < ref->data.desc)
			p = &(*p)->rb_left;
		else if (new_ref->data.desc > ref->data.desc)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new_ref->rb_node_desc, parent, p);
	rb_insert_color(&new_ref->rb_node_desc, &proc->refs_by_desc);

	binder_node_lock(node);
	hlist_add_head(&new_ref->node_entry, &node->refs);

	binder_debug(BINDER_DEBUG_INTERNAL_REFS,
		     "%d new ref %d desc %d for node %d\n",
		      proc->pid, new_ref->data.debug_id, new_ref->data.desc,
		      node->debug_id);
	binder_node_unlock(node);
	return new_ref;
}

/*
 * Unlink @ref from its proc (descriptor bitmap, both rb-trees) and from
 * its node, dropping the node references the ref was holding. On return
 * ref->node is left set only when the node must be freed by the caller;
 * otherwise it is cleared. A pending death notification is dequeued.
 * The ref itself is not freed here.
 */
static void binder_cleanup_ref_olocked(struct binder_ref *ref)
{
	struct dbitmap *dmap = &ref->proc->dmap;
	bool delete_node = false;

	binder_debug(BINDER_DEBUG_INTERNAL_REFS,
		     "%d delete ref %d desc %d for node %d\n",
		      ref->proc->pid, ref->data.debug_id, ref->data.desc,
		      ref->node->debug_id);

	if (dbitmap_enabled(dmap))
		dbitmap_clear_bit(dmap, ref->data.desc);
	rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc);
	rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node);

	binder_node_inner_lock(ref->node);
	if (ref->data.strong)
		binder_dec_node_nilocked(ref->node, 1, 1);

	hlist_del(&ref->node_entry);
	delete_node = binder_dec_node_nilocked(ref->node, 0, 1);
	binder_node_inner_unlock(ref->node);
	/*
	 * Clear ref->node unless we want the caller to free the node
	 */
	if (!delete_node) {
		/*
		 * The caller uses ref->node to determine
		 * whether the node needs to be freed. Clear
		 * it since the node is still alive.
		 */
		ref->node = NULL;
	}

	if (ref->death) {
		binder_debug(BINDER_DEBUG_DEAD_BINDER,
			     "%d delete ref %d desc %d has death notification\n",
			      ref->proc->pid, ref->data.debug_id,
			      ref->data.desc);
		binder_dequeue_work(ref->proc, &ref->death->work);
		binder_stats_deleted(BINDER_STAT_DEATH);
	}
	binder_stats_deleted(BINDER_STAT_REF);
}

/**
 * binder_inc_ref_olocked() - increment the ref for given handle
 * @ref:         ref to be incremented
 * @strong:      if true, strong increment, else weak
 * @target_list: list to queue node work on
 *
 * Increment the ref.
@ref->proc->outer_lock must be held on entry * * Return: 0, if successful, else errno */ static int binder_inc_ref_olocked(struct binder_ref *ref, int strong, struct list_head *target_list) { int ret; if (strong) { if (ref->data.strong == 0) { ret = binder_inc_node(ref->node, 1, 1, target_list); if (ret) return ret; } ref->data.strong++; } else { if (ref->data.weak == 0) { ret = binder_inc_node(ref->node, 0, 1, target_list); if (ret) return ret; } ref->data.weak++; } return 0; } /** * binder_dec_ref_olocked() - dec the ref for given handle * @ref: ref to be decremented * @strong: if true, strong decrement, else weak * * Decrement the ref. * * Return: %true if ref is cleaned up and ready to be freed. */ static bool binder_dec_ref_olocked(struct binder_ref *ref, int strong) { if (strong) { if (ref->data.strong == 0) { binder_user_error("%d invalid dec strong, ref %d desc %d s %d w %d\n", ref->proc->pid, ref->data.debug_id, ref->data.desc, ref->data.strong, ref->data.weak); return false; } ref->data.strong--; if (ref->data.strong == 0) binder_dec_node(ref->node, strong, 1); } else { if (ref->data.weak == 0) { binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n", ref->proc->pid, ref->data.debug_id, ref->data.desc, ref->data.strong, ref->data.weak); return false; } ref->data.weak--; } if (ref->data.strong == 0 && ref->data.weak == 0) { binder_cleanup_ref_olocked(ref); return true; } return false; } /** * binder_get_node_from_ref() - get the node from the given proc/desc * @proc: proc containing the ref * @desc: the handle associated with the ref * @need_strong_ref: if true, only return node if ref is strong * @rdata: the id/refcount data for the ref * * Given a proc and ref handle, return the associated binder_node * * Return: a binder_node or NULL if not found or not strong when strong required */ static struct binder_node *binder_get_node_from_ref( struct binder_proc *proc, u32 desc, bool need_strong_ref, struct binder_ref_data *rdata) { struct 
binder_node *node; struct binder_ref *ref; binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, desc, need_strong_ref); if (!ref) goto err_no_ref; node = ref->node; /* * Take an implicit reference on the node to ensure * it stays alive until the call to binder_put_node() */ binder_inc_node_tmpref(node); if (rdata) *rdata = ref->data; binder_proc_unlock(proc); return node; err_no_ref: binder_proc_unlock(proc); return NULL; } /** * binder_free_ref() - free the binder_ref * @ref: ref to free * * Free the binder_ref. Free the binder_node indicated by ref->node * (if non-NULL) and the binder_ref_death indicated by ref->death. */ static void binder_free_ref(struct binder_ref *ref) { if (ref->node) binder_free_node(ref->node); kfree(ref->death); kfree(ref); } /* shrink descriptor bitmap if needed */ static void try_shrink_dmap(struct binder_proc *proc) { unsigned long *new; int nbits; binder_proc_lock(proc); nbits = dbitmap_shrink_nbits(&proc->dmap); binder_proc_unlock(proc); if (!nbits) return; new = bitmap_zalloc(nbits, GFP_KERNEL); binder_proc_lock(proc); dbitmap_shrink(&proc->dmap, new, nbits); binder_proc_unlock(proc); } /** * binder_update_ref_for_handle() - inc/dec the ref for given handle * @proc: proc containing the ref * @desc: the handle associated with the ref * @increment: true=inc reference, false=dec reference * @strong: true=strong reference, false=weak reference * @rdata: the id/refcount data for the ref * * Given a proc and ref handle, increment or decrement the ref * according to "increment" arg. 
* * Return: 0 if successful, else errno */ static int binder_update_ref_for_handle(struct binder_proc *proc, uint32_t desc, bool increment, bool strong, struct binder_ref_data *rdata) { int ret = 0; struct binder_ref *ref; bool delete_ref = false; binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, desc, strong); if (!ref) { ret = -EINVAL; goto err_no_ref; } if (increment) ret = binder_inc_ref_olocked(ref, strong, NULL); else delete_ref = binder_dec_ref_olocked(ref, strong); if (rdata) *rdata = ref->data; binder_proc_unlock(proc); if (delete_ref) { binder_free_ref(ref); try_shrink_dmap(proc); } return ret; err_no_ref: binder_proc_unlock(proc); return ret; } /** * binder_dec_ref_for_handle() - dec the ref for given handle * @proc: proc containing the ref * @desc: the handle associated with the ref * @strong: true=strong reference, false=weak reference * @rdata: the id/refcount data for the ref * * Just calls binder_update_ref_for_handle() to decrement the ref. * * Return: 0 if successful, else errno */ static int binder_dec_ref_for_handle(struct binder_proc *proc, uint32_t desc, bool strong, struct binder_ref_data *rdata) { return binder_update_ref_for_handle(proc, desc, false, strong, rdata); } /** * binder_inc_ref_for_node() - increment the ref for given proc/node * @proc: proc containing the ref * @node: target node * @strong: true=strong reference, false=weak reference * @target_list: worklist to use if node is incremented * @rdata: the id/refcount data for the ref * * Given a proc and node, increment the ref. 
Create the ref if it * doesn't already exist * * Return: 0 if successful, else errno */ static int binder_inc_ref_for_node(struct binder_proc *proc, struct binder_node *node, bool strong, struct list_head *target_list, struct binder_ref_data *rdata) { struct binder_ref *ref; struct binder_ref *new_ref = NULL; int ret = 0; binder_proc_lock(proc); ref = binder_get_ref_for_node_olocked(proc, node, NULL); if (!ref) { binder_proc_unlock(proc); new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!new_ref) return -ENOMEM; binder_proc_lock(proc); ref = binder_get_ref_for_node_olocked(proc, node, new_ref); } ret = binder_inc_ref_olocked(ref, strong, target_list); *rdata = ref->data; if (ret && ref == new_ref) { /* * Cleanup the failed reference here as the target * could now be dead and have already released its * references by now. Calling on the new reference * with strong=0 and a tmp_refs will not decrement * the node. The new_ref gets kfree'd below. */ binder_cleanup_ref_olocked(new_ref); ref = NULL; } binder_proc_unlock(proc); if (new_ref && ref != new_ref) /* * Another thread created the ref first so * free the one we allocated */ kfree(new_ref); return ret; } static void binder_pop_transaction_ilocked(struct binder_thread *target_thread, struct binder_transaction *t) { BUG_ON(!target_thread); assert_spin_locked(&target_thread->proc->inner_lock); BUG_ON(target_thread->transaction_stack != t); BUG_ON(target_thread->transaction_stack->from != target_thread); target_thread->transaction_stack = target_thread->transaction_stack->from_parent; t->from = NULL; } /** * binder_thread_dec_tmpref() - decrement thread->tmp_ref * @thread: thread to decrement * * A thread needs to be kept alive while being used to create or * handle a transaction. binder_get_txn_from() is used to safely * extract t->from from a binder_transaction and keep the thread * indicated by t->from from being freed. 
When done with that * binder_thread, this function is called to decrement the * tmp_ref and free if appropriate (thread has been released * and no transaction being processed by the driver) */ static void binder_thread_dec_tmpref(struct binder_thread *thread) { /* * atomic is used to protect the counter value while * it cannot reach zero or thread->is_dead is false */ binder_inner_proc_lock(thread->proc); atomic_dec(&thread->tmp_ref); if (thread->is_dead && !atomic_read(&thread->tmp_ref)) { binder_inner_proc_unlock(thread->proc); binder_free_thread(thread); return; } binder_inner_proc_unlock(thread->proc); } /** * binder_proc_dec_tmpref() - decrement proc->tmp_ref * @proc: proc to decrement * * A binder_proc needs to be kept alive while being used to create or * handle a transaction. proc->tmp_ref is incremented when * creating a new transaction or the binder_proc is currently in-use * by threads that are being released. When done with the binder_proc, * this function is called to decrement the counter and free the * proc if appropriate (proc has been released, all threads have * been released and not currenly in-use to process a transaction). */ static void binder_proc_dec_tmpref(struct binder_proc *proc) { binder_inner_proc_lock(proc); proc->tmp_ref--; if (proc->is_dead && RB_EMPTY_ROOT(&proc->threads) && !proc->tmp_ref) { binder_inner_proc_unlock(proc); binder_free_proc(proc); return; } binder_inner_proc_unlock(proc); } /** * binder_get_txn_from() - safely extract the "from" thread in transaction * @t: binder transaction for t->from * * Atomically return the "from" thread and increment the tmp_ref * count for the thread to ensure it stays alive until * binder_thread_dec_tmpref() is called. 
* * Return: the value of t->from */ static struct binder_thread *binder_get_txn_from( struct binder_transaction *t) { struct binder_thread *from; spin_lock(&t->lock); from = t->from; if (from) atomic_inc(&from->tmp_ref); spin_unlock(&t->lock); return from; } /** * binder_get_txn_from_and_acq_inner() - get t->from and acquire inner lock * @t: binder transaction for t->from * * Same as binder_get_txn_from() except it also acquires the proc->inner_lock * to guarantee that the thread cannot be released while operating on it. * The caller must call binder_inner_proc_unlock() to release the inner lock * as well as call binder_dec_thread_txn() to release the reference. * * Return: the value of t->from */ static struct binder_thread *binder_get_txn_from_and_acq_inner( struct binder_transaction *t) __acquires(&t->from->proc->inner_lock) { struct binder_thread *from; from = binder_get_txn_from(t); if (!from) { __acquire(&from->proc->inner_lock); return NULL; } binder_inner_proc_lock(from->proc); if (t->from) { BUG_ON(from != t->from); return from; } binder_inner_proc_unlock(from->proc); __acquire(&from->proc->inner_lock); binder_thread_dec_tmpref(from); return NULL; } /** * binder_free_txn_fixups() - free unprocessed fd fixups * @t: binder transaction for t->from * * If the transaction is being torn down prior to being * processed by the target process, free all of the * fd fixups and fput the file structs. It is safe to * call this function after the fixups have been * processed -- in that case, the list will be empty. 
*/
static void binder_free_txn_fixups(struct binder_transaction *t)
{
	struct binder_txn_fd_fixup *fixup, *tmp;

	list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) {
		fput(fixup->file);
		/* Give back an fd number that was already reserved */
		if (fixup->target_fd >= 0)
			put_unused_fd(fixup->target_fd);
		list_del(&fixup->fixup_entry);
		kfree(fixup);
	}
}

/*
 * Emit the binder_txn_latency_free trace event for @t. The pids of the
 * sending and receiving proc/thread are sampled under t->lock; 0 is
 * used for any that is not set.
 */
static void binder_txn_latency_free(struct binder_transaction *t)
{
	int from_proc, from_thread, to_proc, to_thread;

	spin_lock(&t->lock);
	from_proc = t->from ? t->from->proc->pid : 0;
	from_thread = t->from ? t->from->pid : 0;
	to_proc = t->to_proc ? t->to_proc->pid : 0;
	to_thread = t->to_thread ? t->to_thread->pid : 0;
	spin_unlock(&t->lock);

	trace_binder_txn_latency_free(t, from_proc, from_thread, to_proc, to_thread);
}

/*
 * Free transaction @t. When the transaction has a target proc, its
 * outstanding_txns count is decremented under the inner lock (waking
 * freeze waiters when it reaches zero on a frozen proc) and the link
 * from the buffer back to the transaction is cleared. Any remaining fd
 * fixups are released before the transaction memory is freed.
 */
static void binder_free_transaction(struct binder_transaction *t)
{
	struct binder_proc *target_proc = t->to_proc;

	if (target_proc) {
		binder_inner_proc_lock(target_proc);
		target_proc->outstanding_txns--;
		if (target_proc->outstanding_txns < 0)
			pr_warn("%s: Unexpected outstanding_txns %d\n",
				__func__, target_proc->outstanding_txns);
		if (!target_proc->outstanding_txns && target_proc->is_frozen)
			wake_up_interruptible_all(&target_proc->freeze_wait);
		if (t->buffer)
			t->buffer->transaction = NULL;
		binder_inner_proc_unlock(target_proc);
	}
	if (trace_binder_txn_latency_free_enabled())
		binder_txn_latency_free(t);
	/*
	 * If the transaction has no target_proc, then
	 * t->buffer->transaction has already been cleared.
	 */
	binder_free_txn_fixups(t);
	kfree(t);
	binder_stats_deleted(BINDER_STAT_TRANSACTION);
}

/*
 * Deliver @error_code as the reply for the synchronous transaction @t
 * (one-way transactions are a BUG here). If the sending thread of @t is
 * gone, @t is freed and the same is attempted for t->from_parent, and so
 * on up the chain, until a sender is found or the chain ends. Every
 * transaction visited is freed.
 */
static void binder_send_failed_reply(struct binder_transaction *t,
				     uint32_t error_code)
{
	struct binder_thread *target_thread;
	struct binder_transaction *next;

	BUG_ON(t->flags & TF_ONE_WAY);
	while (1) {
		target_thread = binder_get_txn_from_and_acq_inner(t);
		if (target_thread) {
			binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
				     "send failed reply for transaction %d to %d:%d\n",
				      t->debug_id,
				      target_thread->proc->pid,
				      target_thread->pid);

			binder_pop_transaction_ilocked(target_thread, t);
			if (target_thread->reply_error.cmd == BR_OK) {
				target_thread->reply_error.cmd = error_code;
				binder_enqueue_thread_work_ilocked(
					target_thread,
					&target_thread->reply_error.work);
				wake_up_interruptible(&target_thread->wait);
			} else {
				/*
				 * Cannot get here for normal operation, but
				 * we can if multiple synchronous transactions
				 * are sent without blocking for responses.
				 * Just ignore the 2nd error in this case.
				 */
				pr_warn("Unexpected reply error: %u\n",
					target_thread->reply_error.cmd);
			}
			binder_inner_proc_unlock(target_thread->proc);
			binder_thread_dec_tmpref(target_thread);
			binder_free_transaction(t);
			return;
		}
		/* Lock annotation only: balances the NULL-return path above */
		__release(&target_thread->proc->inner_lock);
		/* Remember the parent before @t is freed */
		next = t->from_parent;

		binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
			     "send failed reply for transaction %d, target dead\n",
			     t->debug_id);

		binder_free_transaction(t);
		if (next == NULL) {
			binder_debug(BINDER_DEBUG_DEAD_BINDER,
				     "reply failed, no target thread at root\n");
			return;
		}
		t = next;
		binder_debug(BINDER_DEBUG_DEAD_BINDER,
			     "reply failed, no target thread -- retry %d\n",
			      t->debug_id);
	}
}

/**
 * binder_cleanup_transaction() - cleans up undelivered transaction
 * @t:		transaction that needs to be cleaned up
 * @reason:	reason the transaction wasn't delivered
 * @error_code:	error to return to caller (if synchronous call)
 */
static void binder_cleanup_transaction(struct binder_transaction *t,
				       const char *reason,
				       uint32_t error_code)
{
	/* Only a two-way transaction with a target node gets a failed reply */
	if (t->buffer->target_node && !(t->flags & TF_ONE_WAY)) {
		binder_send_failed_reply(t, error_code);
	} else {
		binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
			"undelivered transaction %d, %s\n",
			t->debug_id, reason);
		binder_free_transaction(t);
	}
}

/**
 * binder_get_object() - gets object and checks for valid metadata
 * @proc:	binder_proc owning the buffer
 * @u:		sender's user pointer to base of buffer
 * @buffer:	binder_buffer that we're parsing.
 * @offset:	offset in the @buffer at which to validate an object.
 * @object:	struct binder_object to read into
 *
 * Copy the binder object at the given offset into @object. If @u is
 * provided then the copy is from the sender's buffer. If not, then
 * it is copied from the target's @buffer.
 *
 * Return:	If there's a valid metadata object at @offset, the
 *		size of that object. Otherwise, it returns zero. The object
 *		is read into the struct binder_object pointed to by @object.
 */
static size_t binder_get_object(struct binder_proc *proc,
				const void __user *u,
				struct binder_buffer *buffer,
				unsigned long offset,
				struct binder_object *object)
{
	size_t read_size;
	struct binder_object_header *hdr;
	size_t object_size = 0;

	/*
	 * read_size may come from a wrapped subtraction when offset is
	 * past data_size; the offset check below rejects that case before
	 * read_size is used for a copy.
	 */
	read_size = min_t(size_t, sizeof(*object), buffer->data_size - offset);
	if (offset > buffer->data_size || read_size < sizeof(*hdr) ||
	    !IS_ALIGNED(offset, sizeof(u32)))
		return 0;

	if (u) {
		if (copy_from_user(object, u + offset, read_size))
			return 0;
	} else {
		if (binder_alloc_copy_from_buffer(&proc->alloc, object, buffer,
						  offset, read_size))
			return 0;
	}

	/* Ok, now see if we read a complete object. */
	hdr = &object->hdr;
	switch (hdr->type) {
	case BINDER_TYPE_BINDER:
	case BINDER_TYPE_WEAK_BINDER:
	case BINDER_TYPE_HANDLE:
	case BINDER_TYPE_WEAK_HANDLE:
		object_size = sizeof(struct flat_binder_object);
		break;
	case BINDER_TYPE_FD:
		object_size = sizeof(struct binder_fd_object);
		break;
	case BINDER_TYPE_PTR:
		object_size = sizeof(struct binder_buffer_object);
		break;
	case BINDER_TYPE_FDA:
		object_size = sizeof(struct binder_fd_array_object);
		break;
	default:
		return 0;
	}
	/* The whole object, not just its header, must fit in the data area */
	if (offset <= buffer->data_size - object_size &&
	    buffer->data_size >= object_size)
		return object_size;
	else
		return 0;
}

/**
 * binder_validate_ptr() - validates binder_buffer_object in a binder_buffer.
 * @proc:	binder_proc owning the buffer
 * @b:		binder_buffer containing the object
 * @object:	struct binder_object to read into
 * @index:	index in offset array at which the binder_buffer_object is
 *		located
 * @start_offset: points to the start of the offset array
 * @object_offsetp: offset of @object read from @b
 * @num_valid:	the number of valid offsets in the offset array
 *
 * Return:	If @index is within the valid range of the offset array
 *		described by @start_offset and @num_valid, and if there's a valid
 *		binder_buffer_object at the offset found in index @index
 *		of the offset array, that object is returned. Otherwise,
 *		%NULL is returned.
 *		Note that the offset found in index @index itself is not
 *		verified; this function assumes that @num_valid elements
 *		from @start_offset were previously verified to have valid offsets.
 *		If @object_offsetp is non-NULL, then the offset within
 *		@b is written to it.
*/
static struct binder_buffer_object *binder_validate_ptr(
						struct binder_proc *proc,
						struct binder_buffer *b,
						struct binder_object *object,
						binder_size_t index,
						binder_size_t start_offset,
						binder_size_t *object_offsetp,
						binder_size_t num_valid)
{
	binder_size_t obj_off;
	unsigned long slot_off;

	if (index >= num_valid)
		return NULL;

	/* Fetch entry @index of the offset array out of the buffer */
	slot_off = start_offset + sizeof(binder_size_t) * index;
	if (binder_alloc_copy_from_buffer(&proc->alloc, &obj_off,
					  b, slot_off, sizeof(obj_off)))
		return NULL;

	/* Only a well-formed BINDER_TYPE_PTR object is acceptable */
	if (!binder_get_object(proc, NULL, b, obj_off, object))
		return NULL;
	if (object->hdr.type != BINDER_TYPE_PTR)
		return NULL;

	if (object_offsetp)
		*object_offsetp = obj_off;

	return &object->bbo;
}

/**
 * binder_validate_fixup() - validates pointer/fd fixups happen in order.
 * @proc:		binder_proc owning the buffer
 * @b:			transaction buffer
 * @objects_start_offset: offset to start of objects buffer
 * @buffer_obj_offset:	offset to binder_buffer_object in which to fix up
 * @fixup_offset:	start offset in @buffer to fix up
 * @last_obj_offset:	offset to last binder_buffer_object that we fixed
 * @last_min_offset:	minimum fixup offset in object at @last_obj_offset
 *
 * Return:		%true if a fixup in buffer @buffer at offset @offset is
 *			allowed.
 *
 * For safety reasons, we only allow fixups inside a buffer to happen
 * at increasing offsets; additionally, we only allow fixup on the last
 * buffer object that was verified, or one of its parents.
* * Example of what is allowed: * * A * B (parent = A, offset = 0) * C (parent = A, offset = 16) * D (parent = C, offset = 0) * E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset) * * Examples of what is not allowed: * * Decreasing offsets within the same parent: * A * C (parent = A, offset = 16) * B (parent = A, offset = 0) // decreasing offset within A * * Referring to a parent that wasn't the last object or any of its parents: * A * B (parent = A, offset = 0) * C (parent = A, offset = 0) * C (parent = A, offset = 16) * D (parent = B, offset = 0) // B is not A or any of A's parents */ static bool binder_validate_fixup(struct binder_proc *proc, struct binder_buffer *b, binder_size_t objects_start_offset, binder_size_t buffer_obj_offset, binder_size_t fixup_offset, binder_size_t last_obj_offset, binder_size_t last_min_offset) { if (!last_obj_offset) { /* Nothing to fix up in */ return false; } while (last_obj_offset != buffer_obj_offset) { unsigned long buffer_offset; struct binder_object last_object; struct binder_buffer_object *last_bbo; size_t object_size = binder_get_object(proc, NULL, b, last_obj_offset, &last_object); if (object_size != sizeof(*last_bbo)) return false; last_bbo = &last_object.bbo; /* * Safe to retrieve the parent of last_obj, since it * was already previously verified by the driver. */ if ((last_bbo->flags & BINDER_BUFFER_FLAG_HAS_PARENT) == 0) return false; last_min_offset = last_bbo->parent_offset + sizeof(uintptr_t); buffer_offset = objects_start_offset + sizeof(binder_size_t) * last_bbo->parent; if (binder_alloc_copy_from_buffer(&proc->alloc, &last_obj_offset, b, buffer_offset, sizeof(last_obj_offset))) return false; } return (fixup_offset >= last_min_offset); } /** * struct binder_task_work_cb - for deferred close * * @twork: callback_head for task work * @fd: fd to close * * Structure to pass task work to be handled after * returning from binder_ioctl() via task_work_add(). 
*/ struct binder_task_work_cb { struct callback_head twork; struct file *file; }; /** * binder_do_fd_close() - close list of file descriptors * @twork: callback head for task work * * It is not safe to call ksys_close() during the binder_ioctl() * function if there is a chance that binder's own file descriptor * might be closed. This is to meet the requirements for using * fdget() (see comments for __fget_light()). Therefore use * task_work_add() to schedule the close operation once we have * returned from binder_ioctl(). This function is a callback * for that mechanism and does the actual ksys_close() on the * given file descriptor. */ static void binder_do_fd_close(struct callback_head *twork) { struct binder_task_work_cb *twcb = container_of(twork, struct binder_task_work_cb, twork); fput(twcb->file); kfree(twcb); } /** * binder_deferred_fd_close() - schedule a close for the given file-descriptor * @fd: file-descriptor to close * * See comments in binder_do_fd_close(). This function is used to schedule * a file-descriptor to be closed after returning from binder_ioctl(). 
*/ static void binder_deferred_fd_close(int fd) { struct binder_task_work_cb *twcb; twcb = kzalloc(sizeof(*twcb), GFP_KERNEL); if (!twcb) return; init_task_work(&twcb->twork, binder_do_fd_close); twcb->file = file_close_fd(fd); if (twcb->file) { // pin it until binder_do_fd_close(); see comments there get_file(twcb->file); filp_close(twcb->file, current->files); task_work_add(current, &twcb->twork, TWA_RESUME); } else { kfree(twcb); } } static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, binder_size_t off_end_offset, bool is_failure) { int debug_id = buffer->debug_id; binder_size_t off_start_offset, buffer_offset; binder_debug(BINDER_DEBUG_TRANSACTION, "%d buffer release %d, size %zd-%zd, failed at %llx\n", proc->pid, buffer->debug_id, buffer->data_size, buffer->offsets_size, (unsigned long long)off_end_offset); if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); off_start_offset = ALIGN(buffer->data_size, sizeof(void *)); for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; size_t object_size = 0; struct binder_object object; binder_size_t object_offset; if (!binder_alloc_copy_from_buffer(&proc->alloc, &object_offset, buffer, buffer_offset, sizeof(object_offset))) object_size = binder_get_object(proc, NULL, buffer, object_offset, &object); if (object_size == 0) { pr_err("transaction release %d bad object at offset %lld, size %zd\n", debug_id, (u64)object_offset, buffer->data_size); continue; } hdr = &object.hdr; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { struct flat_binder_object *fp; struct binder_node *node; fp = to_flat_binder_object(hdr); node = binder_get_node(proc, fp->binder); if (node == NULL) { pr_err("transaction release %d bad node %016llx\n", debug_id, (u64)fp->binder); break; } binder_debug(BINDER_DEBUG_TRANSACTION, " node %d 
u%016llx\n", node->debug_id, (u64)node->ptr); binder_dec_node(node, hdr->type == BINDER_TYPE_BINDER, 0); binder_put_node(node); } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { struct flat_binder_object *fp; struct binder_ref_data rdata; int ret; fp = to_flat_binder_object(hdr); ret = binder_dec_ref_for_handle(proc, fp->handle, hdr->type == BINDER_TYPE_HANDLE, &rdata); if (ret) { pr_err("transaction release %d bad handle %d, ret = %d\n", debug_id, fp->handle, ret); break; } binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d\n", rdata.debug_id, rdata.desc); } break; case BINDER_TYPE_FD: { /* * No need to close the file here since user-space * closes it for successfully delivered * transactions. For transactions that weren't * delivered, the new fd was never allocated so * there is no need to close and the fput on the * file is done when the transaction is torn * down. */ } break; case BINDER_TYPE_PTR: /* * Nothing to do here, this will get cleaned up when the * transaction buffer gets freed */ break; case BINDER_TYPE_FDA: { struct binder_fd_array_object *fda; struct binder_buffer_object *parent; struct binder_object ptr_object; binder_size_t fda_offset; size_t fd_index; binder_size_t fd_buf_size; binder_size_t num_valid; if (is_failure) { /* * The fd fixups have not been applied so no * fds need to be closed. */ continue; } num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); fda = to_binder_fd_array_object(hdr); parent = binder_validate_ptr(proc, buffer, &ptr_object, fda->parent, off_start_offset, NULL, num_valid); if (!parent) { pr_err("transaction release %d bad parent offset\n", debug_id); continue; } fd_buf_size = sizeof(u32) * fda->num_fds; if (fda->num_fds >= SIZE_MAX / sizeof(u32)) { pr_err("transaction release %d invalid number of fds (%lld)\n", debug_id, (u64)fda->num_fds); continue; } if (fd_buf_size > parent->length || fda->parent_offset > parent->length - fd_buf_size) { /* No space for all file descriptors here. 
*/ pr_err("transaction release %d not enough space for %lld fds in buffer\n", debug_id, (u64)fda->num_fds); continue; } /* * the source data for binder_buffer_object is visible * to user-space and the @buffer element is the user * pointer to the buffer_object containing the fd_array. * Convert the address to an offset relative to * the base of the transaction buffer. */ fda_offset = parent->buffer - buffer->user_data + fda->parent_offset; for (fd_index = 0; fd_index < fda->num_fds; fd_index++) { u32 fd; int err; binder_size_t offset = fda_offset + fd_index * sizeof(fd); err = binder_alloc_copy_from_buffer( &proc->alloc, &fd, buffer, offset, sizeof(fd)); WARN_ON(err); if (!err) { binder_deferred_fd_close(fd); /* * Need to make sure the thread goes * back to userspace to complete the * deferred close */ if (thread) thread->looper_need_return = true; } } } break; default: pr_err("transaction release %d bad object type %x\n", debug_id, hdr->type); break; } } } /* Clean up all the objects in the buffer */ static inline void binder_release_entire_buffer(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, bool is_failure) { binder_size_t off_end_offset; off_end_offset = ALIGN(buffer->data_size, sizeof(void *)); off_end_offset += buffer->offsets_size; binder_transaction_buffer_release(proc, thread, buffer, off_end_offset, is_failure); } static int binder_translate_binder(struct flat_binder_object *fp, struct binder_transaction *t, struct binder_thread *thread) { struct binder_node *node; struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_ref_data rdata; int ret = 0; node = binder_get_node(proc, fp->binder); if (!node) { node = binder_new_node(proc, fp); if (!node) return -ENOMEM; } if (fp->cookie != node->cookie) { binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", proc->pid, thread->pid, (u64)fp->binder, node->debug_id, (u64)fp->cookie, 
(u64)node->cookie); ret = -EINVAL; goto done; } if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } ret = binder_inc_ref_for_node(target_proc, node, fp->hdr.type == BINDER_TYPE_BINDER, &thread->todo, &rdata); if (ret) goto done; if (fp->hdr.type == BINDER_TYPE_BINDER) fp->hdr.type = BINDER_TYPE_HANDLE; else fp->hdr.type = BINDER_TYPE_WEAK_HANDLE; fp->binder = 0; fp->handle = rdata.desc; fp->cookie = 0; trace_binder_transaction_node_to_ref(t, node, &rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " node %d u%016llx -> ref %d desc %d\n", node->debug_id, (u64)node->ptr, rdata.debug_id, rdata.desc); done: binder_put_node(node); return ret; } static int binder_translate_handle(struct flat_binder_object *fp, struct binder_transaction *t, struct binder_thread *thread) { struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_node *node; struct binder_ref_data src_rdata; int ret = 0; node = binder_get_node_from_ref(proc, fp->handle, fp->hdr.type == BINDER_TYPE_HANDLE, &src_rdata); if (!node) { binder_user_error("%d:%d got transaction with invalid handle, %d\n", proc->pid, thread->pid, fp->handle); return -EINVAL; } if (security_binder_transfer_binder(proc->cred, target_proc->cred)) { ret = -EPERM; goto done; } binder_node_lock(node); if (node->proc == target_proc) { if (fp->hdr.type == BINDER_TYPE_HANDLE) fp->hdr.type = BINDER_TYPE_BINDER; else fp->hdr.type = BINDER_TYPE_WEAK_BINDER; fp->binder = node->ptr; fp->cookie = node->cookie; if (node->proc) binder_inner_proc_lock(node->proc); else __acquire(&node->proc->inner_lock); binder_inc_node_nilocked(node, fp->hdr.type == BINDER_TYPE_BINDER, 0, NULL); if (node->proc) binder_inner_proc_unlock(node->proc); else __release(&node->proc->inner_lock); trace_binder_transaction_ref_to_node(t, node, &src_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> node %d u%016llx\n", src_rdata.debug_id, src_rdata.desc, node->debug_id, 
(u64)node->ptr); binder_node_unlock(node); } else { struct binder_ref_data dest_rdata; binder_node_unlock(node); ret = binder_inc_ref_for_node(target_proc, node, fp->hdr.type == BINDER_TYPE_HANDLE, NULL, &dest_rdata); if (ret) goto done; fp->binder = 0; fp->handle = dest_rdata.desc; fp->cookie = 0; trace_binder_transaction_ref_to_ref(t, node, &src_rdata, &dest_rdata); binder_debug(BINDER_DEBUG_TRANSACTION, " ref %d desc %d -> ref %d desc %d (node %d)\n", src_rdata.debug_id, src_rdata.desc, dest_rdata.debug_id, dest_rdata.desc, node->debug_id); } done: binder_put_node(node); return ret; } static int binder_translate_fd(u32 fd, binder_size_t fd_offset, struct binder_transaction *t, struct binder_thread *thread, struct binder_transaction *in_reply_to) { struct binder_proc *proc = thread->proc; struct binder_proc *target_proc = t->to_proc; struct binder_txn_fd_fixup *fixup; struct file *file; int ret = 0; bool target_allows_fd; if (in_reply_to) target_allows_fd = !!(in_reply_to->flags & TF_ACCEPT_FDS); else target_allows_fd = t->buffer->target_node->accept_fds; if (!target_allows_fd) { binder_user_error("%d:%d got %s with fd, %d, but target does not allow fds\n", proc->pid, thread->pid, in_reply_to ? "reply" : "transaction", fd); ret = -EPERM; goto err_fd_not_accepted; } file = fget(fd); if (!file) { binder_user_error("%d:%d got transaction with invalid fd, %d\n", proc->pid, thread->pid, fd); ret = -EBADF; goto err_fget; } ret = security_binder_transfer_file(proc->cred, target_proc->cred, file); if (ret < 0) { ret = -EPERM; goto err_security; } /* * Add fixup record for this transaction. The allocation * of the fd in the target needs to be done from a * target thread. 
*/ fixup = kzalloc(sizeof(*fixup), GFP_KERNEL); if (!fixup) { ret = -ENOMEM; goto err_alloc; } fixup->file = file; fixup->offset = fd_offset; fixup->target_fd = -1; trace_binder_transaction_fd_send(t, fd, fixup->offset); list_add_tail(&fixup->fixup_entry, &t->fd_fixups); return ret; err_alloc: err_security: fput(file); err_fget: err_fd_not_accepted: return ret; } /** * struct binder_ptr_fixup - data to be fixed-up in target buffer * @offset offset in target buffer to fixup * @skip_size bytes to skip in copy (fixup will be written later) * @fixup_data data to write at fixup offset * @node list node * * This is used for the pointer fixup list (pf) which is created and consumed * during binder_transaction() and is only accessed locally. No * locking is necessary. * * The list is ordered by @offset. */ struct binder_ptr_fixup { binder_size_t offset; size_t skip_size; binder_uintptr_t fixup_data; struct list_head node; }; /** * struct binder_sg_copy - scatter-gather data to be copied * @offset offset in target buffer * @sender_uaddr user address in source buffer * @length bytes to copy * @node list node * * This is used for the sg copy list (sgc) which is created and consumed * during binder_transaction() and is only accessed locally. No * locking is necessary. * * The list is ordered by @offset. */ struct binder_sg_copy { binder_size_t offset; const void __user *sender_uaddr; size_t length; struct list_head node; }; /** * binder_do_deferred_txn_copies() - copy and fixup scatter-gather data * @alloc: binder_alloc associated with @buffer * @buffer: binder buffer in target process * @sgc_head: list_head of scatter-gather copy list * @pf_head: list_head of pointer fixup list * * Processes all elements of @sgc_head, applying fixups from @pf_head * and copying the scatter-gather data from the source process' user * buffer to the target's buffer. 
It is expected that the list creation
 * and processing all occurs during binder_transaction() so these lists
 * are only accessed in local context.
 *
 * Both lists are always drained and their elements freed, also after a
 * copy has failed; once an error is recorded no further data is copied.
 *
 * Return: 0=success, else -errno
 */
static int binder_do_deferred_txn_copies(struct binder_alloc *alloc,
					 struct binder_buffer *buffer,
					 struct list_head *sgc_head,
					 struct list_head *pf_head)
{
	int ret = 0;
	struct binder_sg_copy *sgc, *tmpsgc;
	struct binder_ptr_fixup *tmppf;
	/* pf always points at the next fixup not yet consumed (or NULL). */
	struct binder_ptr_fixup *pf =
		list_first_entry_or_null(pf_head, struct binder_ptr_fixup,
					 node);

	list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) {
		size_t bytes_copied = 0;

		while (bytes_copied < sgc->length) {
			size_t copy_size;
			size_t bytes_left = sgc->length - bytes_copied;
			size_t offset = sgc->offset + bytes_copied;

			/*
			 * We copy up to the fixup (pointed to by pf)
			 */
			copy_size = pf ? min(bytes_left, (size_t)pf->offset - offset)
				       : bytes_left;
			/* After the first error only the bookkeeping continues. */
			if (!ret && copy_size)
				ret = binder_alloc_copy_user_to_buffer(
						alloc, buffer,
						offset,
						sgc->sender_uaddr + bytes_copied,
						copy_size);
			bytes_copied += copy_size;
			if (copy_size != bytes_left) {
				BUG_ON(!pf);
				/* we stopped at a fixup offset */
				if (pf->skip_size) {
					/*
					 * we are just skipping. This is for
					 * BINDER_TYPE_FDA where the translated
					 * fds will be fixed up when we get
					 * to target context.
					 */
					bytes_copied += pf->skip_size;
				} else {
					/* apply the fixup indicated by pf */
					if (!ret)
						ret = binder_alloc_copy_to_buffer(
							alloc, buffer,
							pf->offset,
							&pf->fixup_data,
							sizeof(pf->fixup_data));
					bytes_copied += sizeof(pf->fixup_data);
				}
				/* The fixup is consumed: free it and advance. */
				list_del(&pf->node);
				kfree(pf);
				pf = list_first_entry_or_null(pf_head,
						struct binder_ptr_fixup, node);
			}
		}
		list_del(&sgc->node);
		kfree(sgc);
	}
	/* Any fixup left over at this point must be a skip-only entry. */
	list_for_each_entry_safe(pf, tmppf, pf_head, node) {
		BUG_ON(pf->skip_size == 0);
		list_del(&pf->node);
		kfree(pf);
	}
	BUG_ON(!list_empty(sgc_head));

	/*
	 * NOTE(review): a positive ret is presumably a "bytes not copied"
	 * count from the user copy helper; it is reported as -EINVAL --
	 * confirm against binder_alloc_copy_user_to_buffer().
	 */
	return ret > 0 ? -EINVAL : ret;
}

/**
 * binder_cleanup_deferred_txn_lists() - free specified lists
 * @sgc_head:	list_head of scatter-gather copy list
 * @pf_head:	list_head of pointer fixup list
 *
 * Called to clean up @sgc_head and @pf_head if there is an
 * error. Every element is unlinked and freed; nothing is copied.
 */
static void binder_cleanup_deferred_txn_lists(struct list_head *sgc_head,
					      struct list_head *pf_head)
{
	struct binder_sg_copy *sgc, *tmpsgc;
	struct binder_ptr_fixup *pf, *tmppf;

	list_for_each_entry_safe(sgc, tmpsgc, sgc_head, node) {
		list_del(&sgc->node);
		kfree(sgc);
	}
	list_for_each_entry_safe(pf, tmppf, pf_head, node) {
		list_del(&pf->node);
		kfree(pf);
	}
}

/**
 * binder_defer_copy() - queue a scatter-gather buffer for copy
 * @sgc_head:		list_head of scatter-gather copy list
 * @offset:		binder buffer offset in target process
 * @sender_uaddr:	user address in source process
 * @length:		bytes to copy
 *
 * Specify a scatter-gather block to be copied. The actual copy must
 * be deferred until all the needed fixups are identified and queued.
 * Then the copy and fixups are done together so un-translated values
 * from the source are never visible in the target buffer.
 *
 * We are guaranteed that repeated calls to this function will have
 * monotonically increasing @offset values so the list will naturally
 * be ordered.
 *
 * Return: 0=success, else -errno
 */
static int binder_defer_copy(struct list_head *sgc_head, binder_size_t offset,
			     const void __user *sender_uaddr, size_t length)
{
	struct binder_sg_copy *bc = kzalloc(sizeof(*bc), GFP_KERNEL);

	if (!bc)
		return -ENOMEM;

	bc->offset = offset;
	bc->sender_uaddr = sender_uaddr;
	bc->length = length;
	INIT_LIST_HEAD(&bc->node);

	/*
	 * We are guaranteed that the deferred copies are in-order
	 * so just add to the tail.
	 */
	list_add_tail(&bc->node, sgc_head);

	return 0;
}

/**
 * binder_add_fixup() - queue a fixup to be applied to sg copy
 * @pf_head:	list_head of binder ptr fixup list
 * @offset:	binder buffer offset in target process
 * @fixup:	bytes to be copied for fixup
 * @skip_size:	bytes to skip when copying (fixup will be applied later)
 *
 * Add the specified fixup to a list ordered by @offset. When copying
 * the scatter-gather buffers, the fixup will be copied instead of
 * data from the source buffer. For BINDER_TYPE_FDA fixups, the fixup
 * will be applied later (in target process context), so we just skip
 * the bytes specified by @skip_size. If @skip_size is 0, we copy the
 * value in @fixup.
 *
 * This function is called *mostly* in @offset order, but there are
 * exceptions. Since out-of-order inserts are relatively uncommon,
 * we insert the new element by searching backward from the tail of
 * the list.
 *
 * Return: 0=success, else -errno
 */
static int binder_add_fixup(struct list_head *pf_head, binder_size_t offset,
			    binder_uintptr_t fixup, size_t skip_size)
{
	struct binder_ptr_fixup *pf = kzalloc(sizeof(*pf), GFP_KERNEL);
	struct binder_ptr_fixup *tmppf;

	if (!pf)
		return -ENOMEM;

	pf->offset = offset;
	pf->fixup_data = fixup;
	pf->skip_size = skip_size;
	INIT_LIST_HEAD(&pf->node);

	/* Fixups are *mostly* added in-order, but there are some
	 * exceptions. Look backwards through list for insertion point.
	 */
	list_for_each_entry_reverse(tmppf, pf_head, node) {
		/* Insert right after the last entry with a smaller offset. */
		if (tmppf->offset < pf->offset) {
			list_add(&pf->node, &tmppf->node);
			return 0;
		}
	}
	/*
	 * if we get here, then the new offset is the lowest so
	 * insert at the head
	 */
	list_add(&pf->node, pf_head);
	return 0;
}

/*
 * Translate every fd of the fd array @fda for transaction @t.
 *
 * After validating that the array fits inside @parent and is u32-aligned
 * in both the target buffer and the sender's memory, one skip-only fixup
 * covering the whole array is queued on @pf_head and each fd is read from
 * the sender and handed to binder_translate_fd().
 *
 * @sender_ubuffer is not referenced in this function body.
 *
 * Return: 0 on success (including an empty array), -EINVAL on a
 * validation failure or a failed user copy, otherwise the error from
 * binder_add_fixup() or binder_translate_fd().
 */
static int binder_translate_fd_array(struct list_head *pf_head,
				     struct binder_fd_array_object *fda,
				     const void __user *sender_ubuffer,
				     struct binder_buffer_object *parent,
				     struct binder_buffer_object *sender_uparent,
				     struct binder_transaction *t,
				     struct binder_thread *thread,
				     struct binder_transaction *in_reply_to)
{
	binder_size_t fdi, fd_buf_size;
	binder_size_t fda_offset;
	const void __user *sender_ufda_base;
	struct binder_proc *proc = thread->proc;
	int ret;

	if (fda->num_fds == 0)
		return 0;

	/* The product is only trusted once the check below has passed. */
	fd_buf_size = sizeof(u32) * fda->num_fds;
	if (fda->num_fds >= SIZE_MAX / sizeof(u32)) {
		binder_user_error("%d:%d got transaction with invalid number of fds (%lld)\n",
				  proc->pid, thread->pid, (u64)fda->num_fds);
		return -EINVAL;
	}
	if (fd_buf_size > parent->length ||
	    fda->parent_offset > parent->length - fd_buf_size) {
		/* No space for all file descriptors here. */
		binder_user_error("%d:%d not enough space to store %lld fds in buffer\n",
				  proc->pid, thread->pid, (u64)fda->num_fds);
		return -EINVAL;
	}
	/*
	 * the source data for binder_buffer_object is visible
	 * to user-space and the @buffer element is the user
	 * pointer to the buffer_object containing the fd_array.
	 * Convert the address to an offset relative to
	 * the base of the transaction buffer.
	 */
	fda_offset = parent->buffer - t->buffer->user_data +
		fda->parent_offset;
	sender_ufda_base = (void __user *)(uintptr_t)sender_uparent->buffer +
				fda->parent_offset;

	if (!IS_ALIGNED((unsigned long)fda_offset, sizeof(u32)) ||
	    !IS_ALIGNED((unsigned long)sender_ufda_base, sizeof(u32))) {
		binder_user_error("%d:%d parent offset not aligned correctly.\n",
				  proc->pid, thread->pid);
		return -EINVAL;
	}
	/* Skip-only fixup: the deferred copy leaves the fd array alone. */
	ret = binder_add_fixup(pf_head, fda_offset, 0, fda->num_fds * sizeof(u32));
	if (ret)
		return ret;

	for (fdi = 0; fdi < fda->num_fds; fdi++) {
		u32 fd;
		binder_size_t offset = fda_offset + fdi * sizeof(fd);
		binder_size_t sender_uoffset = fdi * sizeof(fd);

		ret = copy_from_user(&fd, sender_ufda_base + sender_uoffset, sizeof(fd));
		if (!ret)
			ret = binder_translate_fd(fd, offset, t, thread,
						  in_reply_to);
		/* A positive value (from copy_from_user) becomes -EINVAL. */
		if (ret)
			return ret > 0 ? -EINVAL : ret;
	}
	return 0;
}

/*
 * Queue the pointer fixup that patches the address of buffer object @bp
 * into its parent buffer.
 *
 * Nothing is done unless @bp has BINDER_BUFFER_FLAG_HAS_PARENT set. The
 * parent is validated, the fixup order is checked and the parent must have
 * room for a binder_uintptr_t at @bp->parent_offset; then a fixup writing
 * @bp->buffer at the corresponding target-buffer offset is added to
 * @pf_head.
 *
 * Return: 0 on success or when there is no parent, -EINVAL on a failed
 * validation, otherwise the result of binder_add_fixup().
 */
static int binder_fixup_parent(struct list_head *pf_head,
			       struct binder_transaction *t,
			       struct binder_thread *thread,
			       struct binder_buffer_object *bp,
			       binder_size_t off_start_offset,
			       binder_size_t num_valid,
			       binder_size_t last_fixup_obj_off,
			       binder_size_t last_fixup_min_off)
{
	struct binder_buffer_object *parent;
	struct binder_buffer *b = t->buffer;
	struct binder_proc *proc = thread->proc;
	struct binder_proc *target_proc = t->to_proc;
	struct binder_object object;
	binder_size_t buffer_offset;
	binder_size_t parent_offset;

	if (!(bp->flags & BINDER_BUFFER_FLAG_HAS_PARENT))
		return 0;

	parent = binder_validate_ptr(target_proc, b, &object, bp->parent,
				     off_start_offset, &parent_offset,
				     num_valid);
	if (!parent) {
		binder_user_error("%d:%d got transaction with invalid parent offset or type\n",
				  proc->pid, thread->pid);
		return -EINVAL;
	}

	if (!binder_validate_fixup(target_proc, b, off_start_offset,
				   parent_offset, bp->parent_offset,
				   last_fixup_obj_off,
				   last_fixup_min_off)) {
		binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n",
				  proc->pid, thread->pid);
		return -EINVAL;
	}

	/* Length is checked first so the subtraction cannot wrap. */
	if (parent->length < sizeof(binder_uintptr_t) ||
	    bp->parent_offset > parent->length - sizeof(binder_uintptr_t)) {
		/* No space for a pointer here! */
		binder_user_error("%d:%d got transaction with invalid parent offset\n",
				  proc->pid, thread->pid);
		return -EINVAL;
	}

	/* Offset of the pointer slot relative to the transaction buffer. */
	buffer_offset = bp->parent_offset + parent->buffer - b->user_data;

	return binder_add_fixup(pf_head, buffer_offset, bp->buffer, 0);
}

/**
 * binder_can_update_transaction() - Can a txn be superseded by an updated one?
 * @t1: the pending async txn in the frozen process
 * @t2: the new async txn to supersede the outdated pending one
 *
 * Both transactions must have TF_ONE_WAY and TF_UPDATE_TXN set and a
 * target process. They match when target task, code, flags, buffer pid
 * and the target node's ptr and cookie are all equal.
 *
 * Return:  true if t2 can supersede t1
 *          false if t2 can not supersede t1
 */
static bool binder_can_update_transaction(struct binder_transaction *t1,
					  struct binder_transaction *t2)
{
	if ((t1->flags & t2->flags & (TF_ONE_WAY | TF_UPDATE_TXN)) !=
	    (TF_ONE_WAY | TF_UPDATE_TXN) || !t1->to_proc || !t2->to_proc)
		return false;
	if (t1->to_proc->tsk == t2->to_proc->tsk && t1->code == t2->code &&
	    t1->flags == t2->flags && t1->buffer->pid == t2->buffer->pid &&
	    t1->buffer->target_node->ptr == t2->buffer->target_node->ptr &&
	    t1->buffer->target_node->cookie == t2->buffer->target_node->cookie)
		return true;
	return false;
}

/**
 * binder_find_outdated_transaction_ilocked() - Find the outdated transaction
 * @t:		 new async transaction
 * @target_list: list to find outdated transaction
 *
 * Return: the outdated transaction if found
 *         NULL if no outdated transaction can be found
 *
 * Requires the proc->inner_lock to be held.
*/
static struct binder_transaction *
binder_find_outdated_transaction_ilocked(struct binder_transaction *t,
					 struct list_head *target_list)
{
	struct binder_work *item;
	struct binder_transaction *candidate;

	list_for_each_entry(item, target_list, entry) {
		/* Only work items of transaction type are considered. */
		if (item->type == BINDER_WORK_TRANSACTION) {
			candidate = container_of(item,
						 struct binder_transaction,
						 work);
			/* The first queued transaction @t may supersede wins. */
			if (binder_can_update_transaction(candidate, t))
				return candidate;
		}
	}

	return NULL;
}

/**
 * binder_proc_transaction() - sends a transaction to a process and wakes it up
 * @t:		transaction to send
 * @proc:	process to send the transaction to
 * @thread:	thread in @proc to send the transaction to (may be NULL)
 *
 * This function queues a transaction to the specified process. It will try
 * to find a thread in the target process to handle the transaction and
 * wake it up. If no thread is found, the work is queued to the proc
 * waitqueue.
 *
 * If the @thread parameter is not NULL, the transaction is always queued
 * to the waitlist of that specific thread.
*
 * Return: 0 if the transaction was successfully queued
 *	BR_DEAD_REPLY if the target process or thread is dead
 *	BR_FROZEN_REPLY if the target process or thread is frozen and
 *			the sync transaction was rejected
 *	BR_TRANSACTION_PENDING_FROZEN if the target process is frozen
 *	and the async transaction was successfully queued
 */
static int binder_proc_transaction(struct binder_transaction *t,
				    struct binder_proc *proc,
				    struct binder_thread *thread)
{
	struct binder_node *node = t->buffer->target_node;
	bool oneway = !!(t->flags & TF_ONE_WAY);
	bool pending_async = false;
	struct binder_transaction *t_outdated = NULL;
	bool frozen = false;

	BUG_ON(!node);
	binder_node_lock(node);
	if (oneway) {
		/* A oneway transaction must not name a target thread. */
		BUG_ON(thread);
		/*
		 * If the node already has an async transaction, this one
		 * goes to the node's async_todo list below; otherwise it
		 * becomes the node's async transaction.
		 */
		if (node->has_async_transaction)
			pending_async = true;
		else
			node->has_async_transaction = true;
	}

	binder_inner_proc_lock(proc);
	if (proc->is_frozen) {
		/* Record which kind of transaction hit the frozen process. */
		frozen = true;
		proc->sync_recv |= !oneway;
		proc->async_recv |= oneway;
	}

	/* Sync transactions to a frozen process are rejected, async are not. */
	if ((frozen && !oneway) || proc->is_dead ||
			(thread && thread->is_dead)) {
		binder_inner_proc_unlock(proc);
		binder_node_unlock(node);
		return frozen ? BR_FROZEN_REPLY : BR_DEAD_REPLY;
	}

	if (!thread && !pending_async)
		thread = binder_select_thread_ilocked(proc);

	if (thread) {
		binder_enqueue_thread_work_ilocked(thread, &t->work);
	} else if (!pending_async) {
		binder_enqueue_work_ilocked(&t->work, &proc->todo);
	} else {
		if ((t->flags & TF_UPDATE_TXN) && frozen) {
			t_outdated = binder_find_outdated_transaction_ilocked(t,
									      &node->async_todo);
			if (t_outdated) {
				binder_debug(BINDER_DEBUG_TRANSACTION,
					     "txn %d supersedes %d\n",
					     t->debug_id, t_outdated->debug_id);
				/* Unlink now; it is freed after unlocking. */
				list_del_init(&t_outdated->work.entry);
				proc->outstanding_txns--;
			}
		}
		binder_enqueue_work_ilocked(&t->work, &node->async_todo);
	}

	if (!pending_async)
		binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */);

	proc->outstanding_txns++;
	binder_inner_proc_unlock(proc);
	binder_node_unlock(node);

	/*
	 * To reduce potential contention, free the outdated transaction and
	 * buffer after releasing the locks.
	 */
	if (t_outdated) {
		struct binder_buffer *buffer = t_outdated->buffer;

		/* Break the links in both directions before releasing. */
		t_outdated->buffer = NULL;
		buffer->transaction = NULL;
		trace_binder_transaction_update_buffer_release(buffer);
		binder_release_entire_buffer(proc, NULL, buffer, false);
		binder_alloc_free_buf(&proc->alloc, buffer);
		kfree(t_outdated);
		binder_stats_deleted(BINDER_STAT_TRANSACTION);
	}

	if (oneway && frozen)
		return BR_TRANSACTION_PENDING_FROZEN;

	return 0;
}

/**
 * binder_get_node_refs_for_txn() - Get required refs on node for txn
 * @node:         struct binder_node for which to get refs
 * @procp:        returns @node->proc if valid
 * @error:        if no @procp then returns BR_DEAD_REPLY
 *
 * User-space normally keeps the node alive when creating a transaction
 * since it has a reference to the target. The local strong ref keeps it
 * alive if the sending process dies before the target process processes
 * the transaction. If the source process is malicious or has a reference
 * counting bug, relying on the local strong ref can fail.
 *
 * Since user-space can cause the local strong ref to go away, we also take
 * a tmpref on the node to ensure it survives while we are constructing
 * the transaction. We also need a tmpref on the proc while we are
 * constructing the transaction, so we take that here as well.
 *
 * Return: The target_node with refs taken or NULL if @node->proc is NULL.
 * Also sets @procp if valid. If the @node->proc is NULL indicating that the
 * target proc has died, @error is set to BR_DEAD_REPLY.
*/
static struct binder_node *binder_get_node_refs_for_txn(
		struct binder_node *node,
		struct binder_proc **procp,
		uint32_t *error)
{
	struct binder_node *target_node = NULL;

	binder_node_inner_lock(node);
	if (node->proc) {
		target_node = node;
		/* Strong increment on the node plus a tmpref on node and proc. */
		binder_inc_node_nilocked(node, 1, 0, NULL);
		binder_inc_node_tmpref_ilocked(node);
		node->proc->tmp_ref++;
		*procp = node->proc;
	} else
		*error = BR_DEAD_REPLY;
	binder_node_inner_unlock(node);

	return target_node;
}

/*
 * Record an extended error (@id, @command, @param) on the thread returned
 * by binder_get_txn_from_and_acq_inner() for @t, unless that thread
 * already has an error recorded (its ee.command is not BR_OK). Nothing is
 * recorded when no such thread is returned.
 */
static void binder_set_txn_from_error(struct binder_transaction *t, int id,
				      uint32_t command, int32_t param)
{
	struct binder_thread *from = binder_get_txn_from_and_acq_inner(t);

	if (!from) {
		/* annotation for sparse */
		__release(&from->proc->inner_lock);
		return;
	}

	/* don't override existing errors */
	if (from->ee.command == BR_OK)
		binder_set_extended_error(&from->ee, id, command, param);
	/* Undo the lock and tmpref taken by the lookup above. */
	binder_inner_proc_unlock(from->proc);
	binder_thread_dec_tmpref(from);
}

static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, binder_size_t extra_buffers_size) { int ret; struct binder_transaction *t; struct binder_work *w; struct binder_work *tcomplete; binder_size_t buffer_offset = 0; binder_size_t off_start_offset, off_end_offset; binder_size_t off_min; binder_size_t sg_buf_offset, sg_buf_end_offset; binder_size_t user_offset = 0; struct binder_proc *target_proc = NULL; struct binder_thread *target_thread = NULL; struct binder_node *target_node = NULL; struct binder_transaction *in_reply_to = NULL; struct binder_transaction_log_entry *e; uint32_t return_error = 0; uint32_t return_error_param = 0; uint32_t return_error_line = 0; binder_size_t last_fixup_obj_off = 0; binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; int t_debug_id = atomic_inc_return(&binder_last_id); ktime_t t_start_time = ktime_get(); char *secctx = NULL; u32 secctx_sz = 0; struct list_head sgc_head; struct list_head pf_head; const void __user
*user_buffer = (const void __user *) (uintptr_t)tr->data.ptr.buffer; INIT_LIST_HEAD(&sgc_head); INIT_LIST_HEAD(&pf_head); e = binder_transaction_log_add(&binder_transaction_log); e->debug_id = t_debug_id; e->call_type = reply ? 2 : !!(tr->flags & TF_ONE_WAY); e->from_proc = proc->pid; e->from_thread = thread->pid; e->target_handle = tr->target.handle; e->data_size = tr->data_size; e->offsets_size = tr->offsets_size; strscpy(e->context_name, proc->context->name, BINDERFS_MAX_NAME); binder_inner_proc_lock(proc); binder_set_extended_error(&thread->ee, t_debug_id, BR_OK, 0); binder_inner_proc_unlock(proc); if (reply) { binder_inner_proc_lock(proc); in_reply_to = thread->transaction_stack; if (in_reply_to == NULL) { binder_inner_proc_unlock(proc); binder_user_error("%d:%d got reply transaction with no transaction stack\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_empty_call_stack; } if (in_reply_to->to_thread != thread) { spin_lock(&in_reply_to->lock); binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n", proc->pid, thread->pid, in_reply_to->debug_id, in_reply_to->to_proc ? in_reply_to->to_proc->pid : 0, in_reply_to->to_thread ? 
in_reply_to->to_thread->pid : 0); spin_unlock(&in_reply_to->lock); binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; in_reply_to = NULL; goto err_bad_call_stack; } thread->transaction_stack = in_reply_to->to_parent; binder_inner_proc_unlock(proc); binder_set_nice(in_reply_to->saved_priority); target_thread = binder_get_txn_from_and_acq_inner(in_reply_to); if (target_thread == NULL) { /* annotation for sparse */ __release(&target_thread->proc->inner_lock); binder_txn_error("%d:%d reply target not found\n", thread->pid, proc->pid); return_error = BR_DEAD_REPLY; return_error_line = __LINE__; goto err_dead_binder; } if (target_thread->transaction_stack != in_reply_to) { binder_user_error("%d:%d got reply transaction with bad target transaction stack %d, expected %d\n", proc->pid, thread->pid, target_thread->transaction_stack ? target_thread->transaction_stack->debug_id : 0, in_reply_to->debug_id); binder_inner_proc_unlock(target_thread->proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; in_reply_to = NULL; target_thread = NULL; goto err_dead_binder; } target_proc = target_thread->proc; target_proc->tmp_ref++; binder_inner_proc_unlock(target_thread->proc); } else { if (tr->target.handle) { struct binder_ref *ref; /* * There must already be a strong ref * on this node. If so, do a strong * increment on the node to ensure it * stays alive until the transaction is * done. 
*/ binder_proc_lock(proc); ref = binder_get_ref_olocked(proc, tr->target.handle, true); if (ref) { target_node = binder_get_node_refs_for_txn( ref->node, &target_proc, &return_error); } else { binder_user_error("%d:%d got transaction to invalid handle, %u\n", proc->pid, thread->pid, tr->target.handle); return_error = BR_FAILED_REPLY; } binder_proc_unlock(proc); } else { mutex_lock(&context->context_mgr_node_lock); target_node = context->binder_context_mgr_node; if (target_node) target_node = binder_get_node_refs_for_txn( target_node, &target_proc, &return_error); else return_error = BR_DEAD_REPLY; mutex_unlock(&context->context_mgr_node_lock); if (target_node && target_proc->pid == proc->pid) { binder_user_error("%d:%d got transaction to context manager from process owning it\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_invalid_target_handle; } } if (!target_node) { binder_txn_error("%d:%d cannot find target node\n", thread->pid, proc->pid); /* * return_error is set above */ return_error_param = -EINVAL; return_error_line = __LINE__; goto err_dead_binder; } e->to_node = target_node->debug_id; if (WARN_ON(proc == target_proc)) { binder_txn_error("%d:%d self transactions not allowed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_invalid_target_handle; } if (security_binder_transaction(proc->cred, target_proc->cred) < 0) { binder_txn_error("%d:%d transaction credentials failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EPERM; return_error_line = __LINE__; goto err_invalid_target_handle; } binder_inner_proc_lock(proc); w = list_first_entry_or_null(&thread->todo, struct binder_work, entry); if (!(tr->flags & TF_ONE_WAY) && w && w->type == BINDER_WORK_TRANSACTION) { /* * Do not allow new outgoing transaction from a * thread that has a transaction at the head of * its 
todo list. Only need to check the head * because binder_select_thread_ilocked picks a * thread from proc->waiting_threads to enqueue * the transaction, and nothing is queued to the * todo list while the thread is on waiting_threads. */ binder_user_error("%d:%d new transaction not allowed when there is a transaction on thread todo\n", proc->pid, thread->pid); binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_bad_todo_list; } if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) { struct binder_transaction *tmp; tmp = thread->transaction_stack; if (tmp->to_thread != thread) { spin_lock(&tmp->lock); binder_user_error("%d:%d got new transaction with bad transaction stack, transaction %d has target %d:%d\n", proc->pid, thread->pid, tmp->debug_id, tmp->to_proc ? tmp->to_proc->pid : 0, tmp->to_thread ? tmp->to_thread->pid : 0); spin_unlock(&tmp->lock); binder_inner_proc_unlock(proc); return_error = BR_FAILED_REPLY; return_error_param = -EPROTO; return_error_line = __LINE__; goto err_bad_call_stack; } while (tmp) { struct binder_thread *from; spin_lock(&tmp->lock); from = tmp->from; if (from && from->proc == target_proc) { atomic_inc(&from->tmp_ref); target_thread = from; spin_unlock(&tmp->lock); break; } spin_unlock(&tmp->lock); tmp = tmp->from_parent; } } binder_inner_proc_unlock(proc); } if (target_thread) e->to_thread = target_thread->pid; e->to_proc = target_proc->pid; /* TODO: reuse incoming transaction for reply */ t = kzalloc(sizeof(*t), GFP_KERNEL); if (t == NULL) { binder_txn_error("%d:%d cannot allocate transaction\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -ENOMEM; return_error_line = __LINE__; goto err_alloc_t_failed; } INIT_LIST_HEAD(&t->fd_fixups); binder_stats_created(BINDER_STAT_TRANSACTION); spin_lock_init(&t->lock); tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL); if (tcomplete == NULL) { binder_txn_error("%d:%d cannot 
allocate work for transaction\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -ENOMEM; return_error_line = __LINE__; goto err_alloc_tcomplete_failed; } binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); t->debug_id = t_debug_id; t->start_time = t_start_time; if (reply) binder_debug(BINDER_DEBUG_TRANSACTION, "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_thread->pid, (u64)tr->data.ptr.buffer, (u64)tr->data.ptr.offsets, (u64)tr->data_size, (u64)tr->offsets_size, (u64)extra_buffers_size); else binder_debug(BINDER_DEBUG_TRANSACTION, "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld-%lld\n", proc->pid, thread->pid, t->debug_id, target_proc->pid, target_node->debug_id, (u64)tr->data.ptr.buffer, (u64)tr->data.ptr.offsets, (u64)tr->data_size, (u64)tr->offsets_size, (u64)extra_buffers_size); if (!reply && !(tr->flags & TF_ONE_WAY)) t->from = thread; else t->from = NULL; t->from_pid = proc->pid; t->from_tid = thread->pid; t->sender_euid = task_euid(proc->tsk); t->to_proc = target_proc; t->to_thread = target_thread; t->code = tr->code; t->flags = tr->flags; t->priority = task_nice(current); if (target_node && target_node->txn_security_ctx) { u32 secid; size_t added_size; security_cred_getsecid(proc->cred, &secid); ret = security_secid_to_secctx(secid, &secctx, &secctx_sz); if (ret) { binder_txn_error("%d:%d failed to get security context\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_get_secctx_failed; } added_size = ALIGN(secctx_sz, sizeof(u64)); extra_buffers_size += added_size; if (extra_buffers_size < added_size) { binder_txn_error("%d:%d integer overflow of extra_buffers_size\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_extra_size; } } 
trace_binder_transaction(reply, t, target_node); t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size, tr->offsets_size, extra_buffers_size, !reply && (t->flags & TF_ONE_WAY)); if (IS_ERR(t->buffer)) { char *s; ret = PTR_ERR(t->buffer); s = (ret == -ESRCH) ? ": vma cleared, target dead or dying" : (ret == -ENOSPC) ? ": no space left" : (ret == -ENOMEM) ? ": memory allocation failed" : ""; binder_txn_error("cannot allocate buffer%s", s); return_error_param = PTR_ERR(t->buffer); return_error = return_error_param == -ESRCH ? BR_DEAD_REPLY : BR_FAILED_REPLY; return_error_line = __LINE__; t->buffer = NULL; goto err_binder_alloc_buf_failed; } if (secctx) { int err; size_t buf_offset = ALIGN(tr->data_size, sizeof(void *)) + ALIGN(tr->offsets_size, sizeof(void *)) + ALIGN(extra_buffers_size, sizeof(void *)) - ALIGN(secctx_sz, sizeof(u64)); t->security_ctx = t->buffer->user_data + buf_offset; err = binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, buf_offset, secctx, secctx_sz); if (err) { t->security_ctx = 0; WARN_ON(1); } security_release_secctx(secctx, secctx_sz); secctx = NULL; } t->buffer->debug_id = t->debug_id; t->buffer->transaction = t; t->buffer->target_node = target_node; t->buffer->clear_on_free = !!(t->flags & TF_CLEAR_BUF); trace_binder_transaction_alloc_buf(t->buffer); if (binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, ALIGN(tr->data_size, sizeof(void *)), (const void __user *) (uintptr_t)tr->data.ptr.offsets, tr->offsets_size)) { binder_user_error("%d:%d got transaction with invalid offsets ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EFAULT; return_error_line = __LINE__; goto err_copy_data_failed; } if (!IS_ALIGNED(tr->offsets_size, sizeof(binder_size_t))) { binder_user_error("%d:%d got transaction with invalid offsets size, %lld\n", proc->pid, thread->pid, (u64)tr->offsets_size); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; 
goto err_bad_offset; } if (!IS_ALIGNED(extra_buffers_size, sizeof(u64))) { binder_user_error("%d:%d got transaction with unaligned buffers size, %lld\n", proc->pid, thread->pid, (u64)extra_buffers_size); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } off_start_offset = ALIGN(tr->data_size, sizeof(void *)); buffer_offset = off_start_offset; off_end_offset = off_start_offset + tr->offsets_size; sg_buf_offset = ALIGN(off_end_offset, sizeof(void *)); sg_buf_end_offset = sg_buf_offset + extra_buffers_size - ALIGN(secctx_sz, sizeof(u64)); off_min = 0; for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; size_t object_size; struct binder_object object; binder_size_t object_offset; binder_size_t copy_size; if (binder_alloc_copy_from_buffer(&target_proc->alloc, &object_offset, t->buffer, buffer_offset, sizeof(object_offset))) { binder_txn_error("%d:%d copy offset from buffer failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } /* * Copy the source user buffer up to the next object * that will be processed. 
*/ copy_size = object_offset - user_offset; if (copy_size && (user_offset > object_offset || binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, user_offset, user_buffer + user_offset, copy_size))) { binder_user_error("%d:%d got transaction with invalid data ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EFAULT; return_error_line = __LINE__; goto err_copy_data_failed; } object_size = binder_get_object(target_proc, user_buffer, t->buffer, object_offset, &object); if (object_size == 0 || object_offset < off_min) { binder_user_error("%d:%d got transaction with invalid offset (%lld, min %lld max %lld) or object.\n", proc->pid, thread->pid, (u64)object_offset, (u64)off_min, (u64)t->buffer->data_size); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } /* * Set offset to the next buffer fragment to be * copied */ user_offset = object_offset + object_size; hdr = &object.hdr; off_min = object_offset + object_size; switch (hdr->type) { case BINDER_TYPE_BINDER: case BINDER_TYPE_WEAK_BINDER: { struct flat_binder_object *fp; fp = to_flat_binder_object(hdr); ret = binder_translate_binder(fp, t, thread); if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fp, sizeof(*fp))) { binder_txn_error("%d:%d translate binder failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } } break; case BINDER_TYPE_HANDLE: case BINDER_TYPE_WEAK_HANDLE: { struct flat_binder_object *fp; fp = to_flat_binder_object(hdr); ret = binder_translate_handle(fp, t, thread); if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fp, sizeof(*fp))) { binder_txn_error("%d:%d translate handle failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto 
err_translate_failed; } } break; case BINDER_TYPE_FD: { struct binder_fd_object *fp = to_binder_fd_object(hdr); binder_size_t fd_offset = object_offset + (uintptr_t)&fp->fd - (uintptr_t)fp; int ret = binder_translate_fd(fp->fd, fd_offset, t, thread, in_reply_to); fp->pad_binder = 0; if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fp, sizeof(*fp))) { binder_txn_error("%d:%d translate fd failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } } break; case BINDER_TYPE_FDA: { struct binder_object ptr_object; binder_size_t parent_offset; struct binder_object user_object; size_t user_parent_size; struct binder_fd_array_object *fda = to_binder_fd_array_object(hdr); size_t num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); struct binder_buffer_object *parent = binder_validate_ptr(target_proc, t->buffer, &ptr_object, fda->parent, off_start_offset, &parent_offset, num_valid); if (!parent) { binder_user_error("%d:%d got transaction with invalid parent offset or type\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_parent; } if (!binder_validate_fixup(target_proc, t->buffer, off_start_offset, parent_offset, fda->parent_offset, last_fixup_obj_off, last_fixup_min_off)) { binder_user_error("%d:%d got transaction with out-of-order buffer fixup\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_parent; } /* * We need to read the user version of the parent * object to get the original user offset */ user_parent_size = binder_get_object(proc, user_buffer, t->buffer, parent_offset, &user_object); if (user_parent_size != sizeof(user_object.bbo)) { binder_user_error("%d:%d invalid ptr object size: %zd vs %zd\n", proc->pid, thread->pid, user_parent_size, 
sizeof(user_object.bbo)); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_parent; } ret = binder_translate_fd_array(&pf_head, fda, user_buffer, parent, &user_object.bbo, t, thread, in_reply_to); if (!ret) ret = binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, fda, sizeof(*fda)); if (ret) { binder_txn_error("%d:%d translate fd array failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret > 0 ? -EINVAL : ret; return_error_line = __LINE__; goto err_translate_failed; } last_fixup_obj_off = parent_offset; last_fixup_min_off = fda->parent_offset + sizeof(u32) * fda->num_fds; } break; case BINDER_TYPE_PTR: { struct binder_buffer_object *bp = to_binder_buffer_object(hdr); size_t buf_left = sg_buf_end_offset - sg_buf_offset; size_t num_valid; if (bp->length > buf_left) { binder_user_error("%d:%d got transaction with too large buffer\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_offset; } ret = binder_defer_copy(&sgc_head, sg_buf_offset, (const void __user *)(uintptr_t)bp->buffer, bp->length); if (ret) { binder_txn_error("%d:%d deferred copy failed\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_translate_failed; } /* Fixup buffer pointer to target proc address space */ bp->buffer = t->buffer->user_data + sg_buf_offset; sg_buf_offset += ALIGN(bp->length, sizeof(u64)); num_valid = (buffer_offset - off_start_offset) / sizeof(binder_size_t); ret = binder_fixup_parent(&pf_head, t, thread, bp, off_start_offset, num_valid, last_fixup_obj_off, last_fixup_min_off); if (ret < 0 || binder_alloc_copy_to_buffer(&target_proc->alloc, t->buffer, object_offset, bp, sizeof(*bp))) { binder_txn_error("%d:%d failed to fixup parent\n", thread->pid, proc->pid); return_error = BR_FAILED_REPLY; return_error_param = 
ret; return_error_line = __LINE__; goto err_translate_failed; } last_fixup_obj_off = object_offset; last_fixup_min_off = 0; } break; default: binder_user_error("%d:%d got transaction with invalid object type, %x\n", proc->pid, thread->pid, hdr->type); return_error = BR_FAILED_REPLY; return_error_param = -EINVAL; return_error_line = __LINE__; goto err_bad_object_type; } } /* Done processing objects, copy the rest of the buffer */ if (binder_alloc_copy_user_to_buffer( &target_proc->alloc, t->buffer, user_offset, user_buffer + user_offset, tr->data_size - user_offset)) { binder_user_error("%d:%d got transaction with invalid data ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = -EFAULT; return_error_line = __LINE__; goto err_copy_data_failed; } ret = binder_do_deferred_txn_copies(&target_proc->alloc, t->buffer, &sgc_head, &pf_head); if (ret) { binder_user_error("%d:%d got transaction with invalid offsets ptr\n", proc->pid, thread->pid); return_error = BR_FAILED_REPLY; return_error_param = ret; return_error_line = __LINE__; goto err_copy_data_failed; } if (t->buffer->oneway_spam_suspect) tcomplete->type = BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT; else tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; t->work.type = BINDER_WORK_TRANSACTION; if (reply) { binder_enqueue_thread_work(thread, tcomplete); binder_inner_proc_lock(target_proc); if (target_thread->is_dead) { return_error = BR_DEAD_REPLY; binder_inner_proc_unlock(target_proc); goto err_dead_proc_or_thread; } BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction_ilocked(target_thread, in_reply_to); binder_enqueue_thread_work_ilocked(target_thread, &t->work); target_proc->outstanding_txns++; binder_inner_proc_unlock(target_proc); wake_up_interruptible_sync(&target_thread->wait); binder_free_transaction(in_reply_to); } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); binder_inner_proc_lock(proc); /* * Defer the TRANSACTION_COMPLETE, 
so we don't return to * userspace immediately; this allows the target process to * immediately start processing this transaction, reducing * latency. We will then return the TRANSACTION_COMPLETE when * the target replies (or there is an error). */ binder_enqueue_deferred_thread_work_ilocked(thread, tcomplete); t->need_reply = 1; t->from_parent = thread->transaction_stack; thread->transaction_stack = t; binder_inner_proc_unlock(proc); return_error = binder_proc_transaction(t, target_proc, target_thread); if (return_error) { binder_inner_proc_lock(proc); binder_pop_transaction_ilocked(thread, t); binder_inner_proc_unlock(proc); goto err_dead_proc_or_thread; } } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); return_error = binder_proc_transaction(t, target_proc, NULL); /* * Let the caller know when async transaction reaches a frozen * process and is put in a pending queue, waiting for the target * process to be unfrozen. */ if (return_error == BR_TRANSACTION_PENDING_FROZEN) tcomplete->type = BINDER_WORK_TRANSACTION_PENDING; binder_enqueue_thread_work(thread, tcomplete); if (return_error && return_error != BR_TRANSACTION_PENDING_FROZEN) goto err_dead_proc_or_thread; } if (target_thread) binder_thread_dec_tmpref(target_thread); binder_proc_dec_tmpref(target_proc); if (target_node) binder_dec_node_tmpref(target_node); /* * write barrier to synchronize with initialization * of log entry */ smp_wmb(); WRITE_ONCE(e->debug_id_done, t_debug_id); return; err_dead_proc_or_thread: binder_txn_error("%d:%d dead process or thread\n", thread->pid, proc->pid); return_error_line = __LINE__; binder_dequeue_work(proc, tcomplete); err_translate_failed: err_bad_object_type: err_bad_offset: err_bad_parent: err_copy_data_failed: binder_cleanup_deferred_txn_lists(&sgc_head, &pf_head); binder_free_txn_fixups(t); trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, NULL, t->buffer, buffer_offset, true); if 
(target_node) binder_dec_node_tmpref(target_node); target_node = NULL; t->buffer->transaction = NULL; binder_alloc_free_buf(&target_proc->alloc, t->buffer); err_binder_alloc_buf_failed: err_bad_extra_size: if (secctx) security_release_secctx(secctx, secctx_sz); err_get_secctx_failed: kfree(tcomplete); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); err_alloc_tcomplete_failed: if (trace_binder_txn_latency_free_enabled()) binder_txn_latency_free(t); kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); err_alloc_t_failed: err_bad_todo_list: err_bad_call_stack: err_empty_call_stack: err_dead_binder: err_invalid_target_handle: if (target_node) { binder_dec_node(target_node, 1, 0); binder_dec_node_tmpref(target_node); } binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d transaction %s to %d:%d failed %d/%d/%d, size %lld-%lld line %d\n", proc->pid, thread->pid, reply ? "reply" : (tr->flags & TF_ONE_WAY ? "async" : "call"), target_proc ? target_proc->pid : 0, target_thread ? target_thread->pid : 0, t_debug_id, return_error, return_error_param, (u64)tr->data_size, (u64)tr->offsets_size, return_error_line); if (target_thread) binder_thread_dec_tmpref(target_thread); if (target_proc) binder_proc_dec_tmpref(target_proc); { struct binder_transaction_log_entry *fe; e->return_error = return_error; e->return_error_param = return_error_param; e->return_error_line = return_error_line; fe = binder_transaction_log_add(&binder_transaction_log_failed); *fe = *e; /* * write barrier to synchronize with initialization * of log entry */ smp_wmb(); WRITE_ONCE(e->debug_id_done, t_debug_id); WRITE_ONCE(fe->debug_id_done, t_debug_id); } BUG_ON(thread->return_error.cmd != BR_OK); if (in_reply_to) { binder_set_txn_from_error(in_reply_to, t_debug_id, return_error, return_error_param); thread->return_error.cmd = BR_TRANSACTION_COMPLETE; binder_enqueue_thread_work(thread, &thread->return_error.work); binder_send_failed_reply(in_reply_to, return_error); } else { 
binder_inner_proc_lock(proc); binder_set_extended_error(&thread->ee, t_debug_id, return_error, return_error_param); binder_inner_proc_unlock(proc); thread->return_error.cmd = return_error; binder_enqueue_thread_work(thread, &thread->return_error.work); } } /** * binder_free_buf() - free the specified buffer * @proc: binder proc that owns buffer * @buffer: buffer to be freed * @is_failure: failed to send transaction * * If buffer for an async transaction, enqueue the next async * transaction from the node. * * Cleanup buffer and free it. */ static void binder_free_buf(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, bool is_failure) { binder_inner_proc_lock(proc); if (buffer->transaction) { buffer->transaction->buffer = NULL; buffer->transaction = NULL; } binder_inner_proc_unlock(proc); if (buffer->async_transaction && buffer->target_node) { struct binder_node *buf_node; struct binder_work *w; buf_node = buffer->target_node; binder_node_inner_lock(buf_node); BUG_ON(!buf_node->has_async_transaction); BUG_ON(buf_node->proc != proc); w = binder_dequeue_work_head_ilocked( &buf_node->async_todo); if (!w) { buf_node->has_async_transaction = false; } else { binder_enqueue_work_ilocked( w, &proc->todo); binder_wakeup_proc_ilocked(proc); } binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); binder_release_entire_buffer(proc, thread, buffer, is_failure); binder_alloc_free_buf(&proc->alloc, buffer); } static int binder_thread_write(struct binder_proc *proc, struct binder_thread *thread, binder_uintptr_t binder_buffer, size_t size, binder_size_t *consumed) { uint32_t cmd; struct binder_context *context = proc->context; void __user *buffer = (void __user *)(uintptr_t)binder_buffer; void __user *ptr = buffer + *consumed; void __user *end = buffer + size; while (ptr < end && thread->return_error.cmd == BR_OK) { int ret; if (get_user(cmd, (uint32_t __user *)ptr)) return -EFAULT; ptr += sizeof(uint32_t); 
trace_binder_command(cmd);
		/*
		 * Bump the global, per-proc and per-thread counters for this
		 * command, but only when its _IOC_NR() value is a valid index
		 * into the bc[] array.
		 */
		if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) {
			atomic_inc(&binder_stats.bc[_IOC_NR(cmd)]);
			atomic_inc(&proc->stats.bc[_IOC_NR(cmd)]);
			atomic_inc(&thread->stats.bc[_IOC_NR(cmd)]);
		}
		switch (cmd) {
		/*
		 * Reference count changes on a handle.
		 * Payload: one 32-bit handle (target).
		 */
		case BC_INCREFS:
		case BC_ACQUIRE:
		case BC_RELEASE:
		case BC_DECREFS: {
			uint32_t target;
			const char *debug_string;
			/* ACQUIRE/RELEASE act on the strong count. */
			bool strong = cmd == BC_ACQUIRE || cmd == BC_RELEASE;
			/* INCREFS/ACQUIRE increment, the other two decrement. */
			bool increment = cmd == BC_INCREFS || cmd == BC_ACQUIRE;
			struct binder_ref_data rdata;

			if (get_user(target, (uint32_t __user *)ptr))
				return -EFAULT;

			ptr += sizeof(uint32_t);
			/*
			 * Stays non-zero unless binder_inc_ref_for_node() below
			 * returns 0, so the handle-based update runs as the
			 * fallback.
			 */
			ret = -1;
			if (increment && !target) {
				/*
				 * Increment on handle 0: go through the context
				 * manager node, read under context_mgr_node_lock.
				 */
				struct binder_node *ctx_mgr_node;

				mutex_lock(&context->context_mgr_node_lock);
				ctx_mgr_node = context->binder_context_mgr_node;
				if (ctx_mgr_node) {
					/* The node's own proc may not reference it via desc 0. */
					if (ctx_mgr_node->proc == proc) {
						binder_user_error("%d:%d context manager tried to acquire desc 0\n",
								  proc->pid, thread->pid);
						mutex_unlock(&context->context_mgr_node_lock);
						return -EINVAL;
					}
					ret = binder_inc_ref_for_node(
							proc, ctx_mgr_node,
							strong, NULL, &rdata);
				}
				mutex_unlock(&context->context_mgr_node_lock);
			}
			if (ret)
				ret = binder_update_ref_for_handle(
						proc, target, increment, strong,
						&rdata);
			/* A descriptor mismatch is only reported; processing continues. */
			if (!ret && rdata.desc != target) {
				binder_user_error("%d:%d tried to acquire reference to desc %d, got %d instead\n",
					proc->pid, thread->pid,
					target, rdata.desc);
			}
			/* Pick the command name used in the messages below. */
			switch (cmd) {
			case BC_INCREFS:
				debug_string = "IncRefs";
				break;
			case BC_ACQUIRE:
				debug_string = "Acquire";
				break;
			case BC_RELEASE:
				debug_string = "Release";
				break;
			case BC_DECREFS:
			default:
				debug_string = "DecRefs";
				break;
			}
			if (ret) {
				binder_user_error("%d:%d %s %d refcount change on invalid ref %d ret %d\n",
					proc->pid, thread->pid, debug_string,
					strong, target, ret);
				/* Leaves the outer switch; the command still counts as consumed. */
				break;
			}
			binder_debug(BINDER_DEBUG_USER_REFS,
				     "%d:%d %s ref %d desc %d s %d w %d\n",
				     proc->pid, thread->pid, debug_string,
				     rdata.debug_id, rdata.desc, rdata.strong,
				     rdata.weak);
			break;
		}
		/*
		 * User space confirms a pending increfs/acquire on a node.
		 * Payload: node pointer followed by cookie.
		 */
		case BC_INCREFS_DONE:
		case BC_ACQUIRE_DONE: {
			binder_uintptr_t node_ptr;
			binder_uintptr_t cookie;
			struct binder_node *node;
			bool free_node;

			if (get_user(node_ptr, (binder_uintptr_t __user *)ptr))
				return -EFAULT;
			ptr += sizeof(binder_uintptr_t);
			if (get_user(cookie, (binder_uintptr_t __user *)ptr))
				return -EFAULT;
			ptr += sizeof(binder_uintptr_t);
			/*
			 * Every path below that got a non-NULL node ends with
			 * binder_put_node().
			 */
			node = binder_get_node(proc, node_ptr);
			if (node == NULL) {
				binder_user_error("%d:%d %s u%016llx no match\n",
					proc->pid, thread->pid,
					cmd == BC_INCREFS_DONE ?
					"BC_INCREFS_DONE" :
					"BC_ACQUIRE_DONE",
					(u64)node_ptr);
				break;
			}
			if (cookie != node->cookie) {
				binder_user_error("%d:%d %s u%016llx node %d cookie mismatch %016llx != %016llx\n",
					proc->pid, thread->pid,
					cmd == BC_INCREFS_DONE ?
					"BC_INCREFS_DONE" : "BC_ACQUIRE_DONE",
					(u64)node_ptr, node->debug_id,
					(u64)cookie, (u64)node->cookie);
				binder_put_node(node);
				break;
			}
			binder_node_inner_lock(node);
			/*
			 * The matching pending flag must be set; it is cleared
			 * here before the count is dropped.
			 */
			if (cmd == BC_ACQUIRE_DONE) {
				if (node->pending_strong_ref == 0) {
					binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n",
						proc->pid, thread->pid,
						node->debug_id);
					binder_node_inner_unlock(node);
					binder_put_node(node);
					break;
				}
				node->pending_strong_ref = 0;
			} else {
				if (node->pending_weak_ref == 0) {
					binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n",
						proc->pid, thread->pid,
						node->debug_id);
					binder_node_inner_unlock(node);
					binder_put_node(node);
					break;
				}
				node->pending_weak_ref = 0;
			}
			/* Second argument selects the strong count for BC_ACQUIRE_DONE. */
			free_node = binder_dec_node_nilocked(node,
					cmd == BC_ACQUIRE_DONE, 0);
			/* A true result is not acted on here, only warned about. */
			WARN_ON(free_node);
			binder_debug(BINDER_DEBUG_USER_REFS,
				     "%d:%d %s node %d ls %d lw %d tr %d\n",
				     proc->pid, thread->pid,
				     cmd == BC_INCREFS_DONE ?
				     "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE",
				     node->debug_id, node->local_strong_refs,
				     node->local_weak_refs, node->tmp_refs);
			binder_node_inner_unlock(node);
			binder_put_node(node);
			break;
		}
		/* Both commands are rejected without reading any payload. */
		case BC_ATTEMPT_ACQUIRE:
			pr_err("BC_ATTEMPT_ACQUIRE not supported\n");
			return -EINVAL;
		case BC_ACQUIRE_RESULT:
			pr_err("BC_ACQUIRE_RESULT not supported\n");
			return -EINVAL;

		/*
		 * Free a buffer identified by its user-space data pointer.
		 * Payload: one binder_uintptr_t.
		 */
		case BC_FREE_BUFFER: {
			binder_uintptr_t data_ptr;
			struct binder_buffer *buffer;

			if (get_user(data_ptr, (binder_uintptr_t __user *)ptr))
				return -EFAULT;
			ptr += sizeof(binder_uintptr_t);

			buffer = binder_alloc_prepare_to_free(&proc->alloc,
							      data_ptr);
			/* NULL or an error pointer: report and skip this command. */
			if (IS_ERR_OR_NULL(buffer)) {
				if (PTR_ERR(buffer) == -EPERM) {
					binder_user_error(
						"%d:%d BC_FREE_BUFFER u%016llx matched unreturned or currently freeing buffer\n",
						proc->pid, thread->pid,
						(u64)data_ptr);
				} else {
					binder_user_error(
						"%d:%d BC_FREE_BUFFER u%016llx no match\n",
						proc->pid, thread->pid,
						(u64)data_ptr);
				}
				break;
			}
			binder_debug(BINDER_DEBUG_FREE_BUFFER,
				     "%d:%d BC_FREE_BUFFER u%016llx found buffer %d for %s transaction\n",
				     proc->pid, thread->pid, (u64)data_ptr,
				     buffer->debug_id,
				     buffer->transaction ? "active" : "finished");
			/* Freed on user request, so is_failure is false. */
			binder_free_buf(proc, thread, buffer, false);
			break;
		}

		/*
		 * Scatter-gather variants: the payload is a
		 * binder_transaction_data_sg whose buffers_size is passed on
		 * as the last argument.
		 */
		case BC_TRANSACTION_SG:
		case BC_REPLY_SG: {
			struct binder_transaction_data_sg tr;

			if (copy_from_user(&tr, ptr, sizeof(tr)))
				return -EFAULT;
			ptr += sizeof(tr);
			binder_transaction(proc, thread, &tr.transaction_data,
					   cmd == BC_REPLY_SG, tr.buffers_size);
			break;
		}
		/* Plain variants: same call with 0 as the last argument. */
		case BC_TRANSACTION:
		case BC_REPLY: {
			struct binder_transaction_data tr;

			if (copy_from_user(&tr, ptr, sizeof(tr)))
				return -EFAULT;
			ptr += sizeof(tr);
			binder_transaction(proc, thread, &tr,
					   cmd == BC_REPLY, 0);
			break;
		}

		/*
		 * Looper state commands carry no payload. REGISTER updates
		 * proc counters and therefore runs under the inner proc lock.
		 */
		case BC_REGISTER_LOOPER:
			binder_debug(BINDER_DEBUG_THREADS,
				     "%d:%d BC_REGISTER_LOOPER\n",
				     proc->pid, thread->pid);
			binder_inner_proc_lock(proc);
			if (thread->looper & BINDER_LOOPER_STATE_ENTERED) {
				thread->looper |= BINDER_LOOPER_STATE_INVALID;
				binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called after BC_ENTER_LOOPER\n",
					proc->pid, thread->pid);
			} else if (proc->requested_threads == 0) {
				thread->looper |= BINDER_LOOPER_STATE_INVALID;
				binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called without request\n",
					proc->pid, thread->pid);
			} else {
				/* Move one thread from "requested" to "started". */
				proc->requested_threads--;
				proc->requested_threads_started++;
			}
			/* Set on every path, including the two error paths above. */
			thread->looper |= BINDER_LOOPER_STATE_REGISTERED;
			binder_inner_proc_unlock(proc);
			break;
		case BC_ENTER_LOOPER:
			binder_debug(BINDER_DEBUG_THREADS,
				     "%d:%d BC_ENTER_LOOPER\n",
				     proc->pid, thread->pid);
			if (thread->looper & BINDER_LOOPER_STATE_REGISTERED) {
				thread->looper |= BINDER_LOOPER_STATE_INVALID;
				binder_user_error("%d:%d ERROR: BC_ENTER_LOOPER called after BC_REGISTER_LOOPER\n",
					proc->pid, thread->pid);
			}
			/* Set even when the state was just marked invalid. */
			thread->looper |= BINDER_LOOPER_STATE_ENTERED;
			break;
		case BC_EXIT_LOOPER:
			binder_debug(BINDER_DEBUG_THREADS,
				     "%d:%d BC_EXIT_LOOPER\n",
				     proc->pid, thread->pid);
			thread->looper |= BINDER_LOOPER_STATE_EXITED;
			break;

		/*
		 * Set or clear the death notification of a reference.
		 * Payload: 32-bit handle followed by a cookie.
		 */
		case BC_REQUEST_DEATH_NOTIFICATION:
		case BC_CLEAR_DEATH_NOTIFICATION: {
			uint32_t target;
			binder_uintptr_t cookie;
			struct binder_ref *ref;
			struct binder_ref_death *death = NULL;

			if (get_user(target, (uint32_t __user *)ptr))
				return -EFAULT;
			ptr += sizeof(uint32_t);
			if (get_user(cookie, (binder_uintptr_t __user *)ptr))
				return -EFAULT;
			ptr += sizeof(binder_uintptr_t);
			if (cmd == BC_REQUEST_DEATH_NOTIFICATION) {
				/*
				 * Allocate memory for death notification
				 * before taking lock
				 */
				death = kzalloc(sizeof(*death), GFP_KERNEL);
				if (death == NULL) {
					/*
					 * Report BR_ERROR through the thread's
					 * return_error work item. Since
					 * return_error.cmd is no longer BR_OK,
					 * the enclosing while loop stops after
					 * this command.
					 */
					WARN_ON(thread->return_error.cmd !=
						BR_OK);
					thread->return_error.cmd = BR_ERROR;
					binder_enqueue_thread_work(
						thread,
						&thread->return_error.work);
					binder_debug(
						BINDER_DEBUG_FAILED_TRANSACTION,
						"%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n",
						proc->pid, thread->pid);
					break;
				}
			}
			/*
			 * Locks are taken in this order below: proc lock, node
			 * lock, then (where needed) inner proc lock.
			 */
			binder_proc_lock(proc);
			ref = binder_get_ref_olocked(proc, target, false);
			if (ref == NULL) {
				binder_user_error("%d:%d %s invalid ref %d\n",
					proc->pid, thread->pid,
					cmd == BC_REQUEST_DEATH_NOTIFICATION ?
					"BC_REQUEST_DEATH_NOTIFICATION" :
					"BC_CLEAR_DEATH_NOTIFICATION",
					target);
				binder_proc_unlock(proc);
				/* death is still NULL for BC_CLEAR_DEATH_NOTIFICATION here. */
				kfree(death);
				break;
			}

			binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION,
				     "%d:%d %s %016llx ref %d desc %d s %d w %d for node %d\n",
				     proc->pid, thread->pid,
				     cmd == BC_REQUEST_DEATH_NOTIFICATION ?
				     "BC_REQUEST_DEATH_NOTIFICATION" :
				     "BC_CLEAR_DEATH_NOTIFICATION",
				     (u64)cookie, ref->data.debug_id,
				     ref->data.desc, ref->data.strong,
				     ref->data.weak, ref->node->debug_id);

			binder_node_lock(ref->node);
			if (cmd == BC_REQUEST_DEATH_NOTIFICATION) {
				/* Only one notification per ref: drop the new allocation. */
				if (ref->death) {
					binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n",
						proc->pid, thread->pid);
					binder_node_unlock(ref->node);
					binder_proc_unlock(proc);
					kfree(death);
					break;
				}
				binder_stats_created(BINDER_STAT_DEATH);
				INIT_LIST_HEAD(&death->work.entry);
				death->cookie = cookie;
				ref->death = death;
				/*
				 * The node has no owning proc (presumably the
				 * owner is already gone - confirm against the
				 * node teardown code): queue the dead-binder
				 * work on proc->todo right away.
				 */
				if (ref->node->proc == NULL) {
					ref->death->work.type = BINDER_WORK_DEAD_BINDER;

					binder_inner_proc_lock(proc);
					binder_enqueue_work_ilocked(
						&ref->death->work, &proc->todo);
					binder_wakeup_proc_ilocked(proc);
					binder_inner_proc_unlock(proc);
				}
			} else {
				if (ref->death == NULL) {
					binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n",
						proc->pid, thread->pid);
					binder_node_unlock(ref->node);
					binder_proc_unlock(proc);
					break;
				}
				death = ref->death;
				/* The cookie must match the one given at request time. */
				if (death->cookie != cookie) {
					binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification cookie mismatch %016llx != %016llx\n",
						proc->pid, thread->pid,
						(u64)death->cookie,
						(u64)cookie);
					binder_node_unlock(ref->node);
					binder_proc_unlock(proc);
					break;
				}
				ref->death = NULL;
				binder_inner_proc_lock(proc);
				if (list_empty(&death->work.entry)) {
					/*
					 * The work item is on no list: queue the
					 * clear notification on this thread if it
					 * is a looper (registered or entered),
					 * otherwise on the proc todo list.
					 */
					death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION;
					if (thread->looper &
					    (BINDER_LOOPER_STATE_REGISTERED |
					     BINDER_LOOPER_STATE_ENTERED))
						binder_enqueue_thread_work_ilocked(
								thread,
								&death->work);
					else {
						binder_enqueue_work_ilocked(
								&death->work,
								&proc->todo);
						binder_wakeup_proc_ilocked(
								proc);
					}
				} else {
					/*
					 * The work item is already on a list as
					 * BINDER_WORK_DEAD_BINDER: retag it so
					 * the clear is handled together with it.
					 */
					BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER);
					death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR;
				}
				binder_inner_proc_unlock(proc);
			}
			binder_node_unlock(ref->node);
			binder_proc_unlock(proc);
		} break;
		/*
		 * User space acknowledges a delivered death notification.
		 * Payload: the cookie of that notification.
		 */
		case BC_DEAD_BINDER_DONE: {
			struct binder_work *w;
			binder_uintptr_t cookie;
struct binder_ref_death *death = NULL;

			if (get_user(cookie, (binder_uintptr_t __user *)ptr))
				return -EFAULT;

			ptr += sizeof(cookie);
			binder_inner_proc_lock(proc);
			list_for_each_entry(w, &proc->delivered_death,
					    entry) {
				struct binder_ref_death *tmp_death =
					container_of(w,
						     struct binder_ref_death,
						     work);

				if (tmp_death->cookie == cookie) {
					death = tmp_death;
					break;
				}
			}
			binder_debug(BINDER_DEBUG_DEAD_BINDER,
				     "%d:%d BC_DEAD_BINDER_DONE %016llx found %pK\n",
				     proc->pid, thread->pid, (u64)cookie,
				     death);
			if (death == NULL) {
				binder_user_error("%d:%d BC_DEAD_BINDER_DONE %016llx not found\n",
					proc->pid, thread->pid, (u64)cookie);
				binder_inner_proc_unlock(proc);
				break;
			}
			binder_dequeue_work_ilocked(&death->work);
			if (death->work.type == BINDER_WORK_DEAD_BINDER_AND_CLEAR) {
				death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION;
				if (thread->looper &
					(BINDER_LOOPER_STATE_REGISTERED |
					 BINDER_LOOPER_STATE_ENTERED))
					binder_enqueue_thread_work_ilocked(
						thread, &death->work);
				else {
					binder_enqueue_work_ilocked(
						&death->work,
						&proc->todo);
					binder_wakeup_proc_ilocked(proc);
				}
			}
			binder_inner_proc_unlock(proc);
		} break;

		default:
			pr_err("%d:%d unknown command %u\n",
			       proc->pid, thread->pid, cmd);
			return -EINVAL;
		}
		*consumed = ptr - buffer;
	}
	return 0;
}

/*
 * Account one BR_* return command.
 *
 * Calls trace_binder_return(cmd) unconditionally.  When _IOC_NR(cmd) is a
 * valid index into binder_stats.br, the matching counter is atomically
 * incremented in the global, per-process and per-thread statistics;
 * out-of-range command numbers are not counted.
 */
static void binder_stat_br(struct binder_proc *proc,
			   struct binder_thread *thread, uint32_t cmd)
{
	trace_binder_return(cmd);
	if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) {
		atomic_inc(&binder_stats.br[_IOC_NR(cmd)]);
		atomic_inc(&proc->stats.br[_IOC_NR(cmd)]);
		atomic_inc(&thread->stats.br[_IOC_NR(cmd)]);
	}
}

/*
 * Write one node command record to the user read buffer at *ptrp.
 *
 * The record is a uint32_t @cmd followed by @node_ptr and @node_cookie
 * (each a binder_uintptr_t).  On success the command is accounted with
 * binder_stat_br(), a debug line using @cmd_name and @node_debug_id is
 * emitted, and *ptrp is advanced past the record.
 *
 * Return: 0 on success, -EFAULT if any put_user() fails.  *ptrp is only
 * updated on success.
 */
static int binder_put_node_cmd(struct binder_proc *proc,
			       struct binder_thread *thread,
			       void __user **ptrp,
			       binder_uintptr_t node_ptr,
			       binder_uintptr_t node_cookie,
			       int node_debug_id,
			       uint32_t cmd, const char *cmd_name)
{
	void __user *ptr = *ptrp;

	if (put_user(cmd, (uint32_t __user *)ptr))
		return -EFAULT;
	ptr += sizeof(uint32_t);

	if (put_user(node_ptr, (binder_uintptr_t __user *)ptr))
		return -EFAULT;
	ptr += sizeof(binder_uintptr_t);

	if (put_user(node_cookie, (binder_uintptr_t __user *)ptr))
		return -EFAULT;
	ptr += sizeof(binder_uintptr_t);

	binder_stat_br(proc, thread, cmd);
	binder_debug(BINDER_DEBUG_USER_REFS, "%d:%d %s %d u%016llx c%016llx\n",
		     proc->pid, thread->pid, cmd_name, node_debug_id,
		     (u64)node_ptr, (u64)node_cookie);

	*ptrp = ptr;
	return 0;
}

/*
 * Sleep on thread->wait until binder_has_work_ilocked() reports work for
 * this thread or a signal is pending.
 *
 * The inner proc lock is taken here, dropped around schedule() and
 * released before returning.  When @do_proc_work is true the thread is put
 * on proc->waiting_threads before each sleep; it is removed again (with
 * list_del_init) after every wake-up.
 *
 * Return: 0 when work is available, -EINTR when woken with a signal
 * pending.
 */
static int binder_wait_for_work(struct binder_thread *thread,
				bool do_proc_work)
{
	DEFINE_WAIT(wait);
	struct binder_proc *proc = thread->proc;
	int ret = 0;

	binder_inner_proc_lock(proc);
	for (;;) {
		prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE);
		if (binder_has_work_ilocked(thread, do_proc_work))
			break;
		if (do_proc_work)
			list_add(&thread->waiting_thread_node,
				 &proc->waiting_threads);
		binder_inner_proc_unlock(proc);
		schedule();
		binder_inner_proc_lock(proc);
		list_del_init(&thread->waiting_thread_node);
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
	}
	finish_wait(&thread->wait, &wait);
	binder_inner_proc_unlock(proc);

	return ret;
}

/**
 * binder_apply_fd_fixups() - finish fd translation
 * @proc:         binder_proc associated @t->buffer
 * @t:	binder transaction with list of fd fixups
 *
 * Now that we are in the context of the transaction target
 * process, we can allocate and install fds. Process the
 * list of fds to translate and fixup the buffer with the
 * new fds first and only then install the files.
 *
 * If we fail to allocate an fd, skip the install and release
 * any fds that have already been allocated.
 */
static int binder_apply_fd_fixups(struct binder_proc *proc,
				  struct binder_transaction *t)
{
	struct binder_txn_fd_fixup *fixup, *tmp;
	int ret = 0;

	/* Pass 1: reserve an fd per fixup and patch it into the buffer. */
	list_for_each_entry(fixup, &t->fd_fixups, fixup_entry) {
		int fd = get_unused_fd_flags(O_CLOEXEC);

		if (fd < 0) {
			binder_debug(BINDER_DEBUG_TRANSACTION,
				     "failed fd fixup txn %d fd %d\n",
				     t->debug_id, fd);
			ret = -ENOMEM;
			goto err;
		}
		binder_debug(BINDER_DEBUG_TRANSACTION,
			     "fd fixup txn %d fd %d\n",
			     t->debug_id, fd);
		trace_binder_transaction_fd_recv(t, fd, fixup->offset);
		fixup->target_fd = fd;
		if (binder_alloc_copy_to_buffer(&proc->alloc, t->buffer,
						fixup->offset, &fd,
						sizeof(u32))) {
			ret = -EINVAL;
			goto err;
		}
	}
	/* Pass 2: every fixup succeeded, so install the files and free the list. */
	list_for_each_entry_safe(fixup, tmp, &t->fd_fixups, fixup_entry) {
		fd_install(fixup->target_fd, fixup->file);
		list_del(&fixup->fixup_entry);
		kfree(fixup);
	}

	return ret;

err:
	binder_free_txn_fixups(t);
	return ret;
}

/*
 * Fill the user read buffer with BR_* commands for @thread.
 *
 * @binder_buffer/@size describe the user buffer and *@consumed is the
 * offset already filled; it is updated to the new fill level at the "done"
 * label.  When *@consumed is 0 a BR_NOOP is written first.  With @non_block
 * set the function returns -EAGAIN instead of sleeping when there is no
 * work.  Work items are taken from thread->todo, or from proc->todo when
 * the thread is available for process work.
 *
 * Return: 0 on success, -EFAULT on a failed user copy, or the error from
 * waiting / writing a node command.
 */
static int binder_thread_read(struct binder_proc *proc,
			      struct binder_thread *thread,
			      binder_uintptr_t binder_buffer, size_t size,
			      binder_size_t *consumed, int non_block)
{
	void __user *buffer = (void __user *)(uintptr_t)binder_buffer;
	void __user *ptr = buffer + *consumed;
	void __user *end = buffer + size;

	int ret = 0;
	int wait_for_proc_work;

	if (*consumed == 0) {
		if (put_user(BR_NOOP, (uint32_t __user *)ptr))
			return -EFAULT;
		ptr += sizeof(uint32_t);
	}

retry:
	binder_inner_proc_lock(proc);
	wait_for_proc_work = binder_available_for_proc_work_ilocked(thread);
	binder_inner_proc_unlock(proc);

	thread->looper |= BINDER_LOOPER_STATE_WAITING;

	trace_binder_wait_for_work(wait_for_proc_work,
				   !!thread->transaction_stack,
				   !binder_worklist_empty(proc, &thread->todo));
	if (wait_for_proc_work) {
		if (!(thread->looper & (BINDER_LOOPER_STATE_REGISTERED |
					BINDER_LOOPER_STATE_ENTERED))) {
			binder_user_error("%d:%d ERROR: Thread waiting for process work before calling BC_REGISTER_LOOPER or BC_ENTER_LOOPER (state %x)\n",
				proc->pid, thread->pid, thread->looper);
			wait_event_interruptible(binder_user_error_wait,
binder_stop_on_user_error < 2);
		}
		binder_set_nice(proc->default_priority);
	}

	if (non_block) {
		if (!binder_has_work(thread, wait_for_proc_work))
			ret = -EAGAIN;
	} else {
		ret = binder_wait_for_work(thread, wait_for_proc_work);
	}

	thread->looper &= ~BINDER_LOOPER_STATE_WAITING;

	if (ret)
		return ret;

	/* Drain work items until one ends the loop or the buffer is full. */
	while (1) {
		uint32_t cmd;
		struct binder_transaction_data_secctx tr;
		struct binder_transaction_data *trd = &tr.transaction_data;
		struct binder_work *w = NULL;
		struct list_head *list = NULL;
		struct binder_transaction *t = NULL;
		struct binder_thread *t_from;
		size_t trsize = sizeof(*trd);

		binder_inner_proc_lock(proc);
		/* Thread-local work takes precedence over process work. */
		if (!binder_worklist_empty_ilocked(&thread->todo))
			list = &thread->todo;
		else if (!binder_worklist_empty_ilocked(&proc->todo) &&
			   wait_for_proc_work)
			list = &proc->todo;
		else {
			binder_inner_proc_unlock(proc);

			/* no data added */
			if (ptr - buffer == 4 && !thread->looper_need_return)
				goto retry;
			break;
		}

		/* Stop when a full transaction record would no longer fit. */
		if (end - ptr < sizeof(tr) + 4) {
			binder_inner_proc_unlock(proc);
			break;
		}
		w = binder_dequeue_work_head_ilocked(list);
		if (binder_worklist_empty_ilocked(&thread->todo))
			thread->process_todo = false;

		/* Every case below drops the inner proc lock exactly once. */
		switch (w->type) {
		case BINDER_WORK_TRANSACTION: {
			binder_inner_proc_unlock(proc);
			t = container_of(w, struct binder_transaction, work);
		} break;
		case BINDER_WORK_RETURN_ERROR: {
			struct binder_error *e = container_of(
					w, struct binder_error, work);

			WARN_ON(e->cmd == BR_OK);
			binder_inner_proc_unlock(proc);
			if (put_user(e->cmd, (uint32_t __user *)ptr))
				return -EFAULT;
			cmd = e->cmd;
			e->cmd = BR_OK;
			ptr += sizeof(uint32_t);

			binder_stat_br(proc, thread, cmd);
		} break;
		case BINDER_WORK_TRANSACTION_COMPLETE:
		case BINDER_WORK_TRANSACTION_PENDING:
		case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT: {
			if (proc->oneway_spam_detection_enabled &&
				   w->type == BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT)
				cmd = BR_ONEWAY_SPAM_SUSPECT;
			else if (w->type == BINDER_WORK_TRANSACTION_PENDING)
				cmd = BR_TRANSACTION_PENDING_FROZEN;
			else
				cmd = BR_TRANSACTION_COMPLETE;
			binder_inner_proc_unlock(proc);
			kfree(w);
			binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
			if (put_user(cmd, (uint32_t __user *)ptr))
				return -EFAULT;
			ptr += sizeof(uint32_t);

			binder_stat_br(proc, thread, cmd);
			binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE,
				     "%d:%d BR_TRANSACTION_COMPLETE\n",
				     proc->pid, thread->pid);
		} break;
		case BINDER_WORK_NODE: {
			struct binder_node *node = container_of(w, struct binder_node, work);
			int strong, weak;
			binder_uintptr_t node_ptr = node->ptr;
			binder_uintptr_t node_cookie = node->cookie;
			int node_debug_id = node->debug_id;
			int has_weak_ref;
			int has_strong_ref;
			void __user *orig_ptr = ptr;

			BUG_ON(proc != node->proc);
			strong = node->internal_strong_refs ||
					node->local_strong_refs;
			weak = !hlist_empty(&node->refs) ||
					node->local_weak_refs ||
					node->tmp_refs || strong;
			has_strong_ref = node->has_strong_ref;
			has_weak_ref = node->has_weak_ref;

			if (weak && !has_weak_ref) {
				node->has_weak_ref = 1;
				node->pending_weak_ref = 1;
				node->local_weak_refs++;
			}
			if (strong && !has_strong_ref) {
				node->has_strong_ref = 1;
				node->pending_strong_ref = 1;
				node->local_strong_refs++;
			}
			if (!strong && has_strong_ref)
				node->has_strong_ref = 0;
			if (!weak && has_weak_ref)
				node->has_weak_ref = 0;
			if (!weak && !strong) {
				binder_debug(BINDER_DEBUG_INTERNAL_REFS,
					     "%d:%d node %d u%016llx c%016llx deleted\n",
					     proc->pid, thread->pid,
					     node_debug_id,
					     (u64)node_ptr,
					     (u64)node_cookie);
				rb_erase(&node->rb_node, &proc->nodes);
				binder_inner_proc_unlock(proc);
				binder_node_lock(node);
				/*
				 * Acquire the node lock before freeing the
				 * node to serialize with other threads that
				 * may have been holding the node lock while
				 * decrementing this node (avoids race where
				 * this thread frees while the other thread
				 * is unlocking the node after the final
				 * decrement)
				 */
				binder_node_unlock(node);
				binder_free_node(node);
			} else
				binder_inner_proc_unlock(proc);

			/*
			 * Report each state change to user space using the
			 * values cached above; the node may already be freed.
			 */
			if (weak && !has_weak_ref)
				ret = binder_put_node_cmd(
						proc, thread, &ptr, node_ptr,
						node_cookie, node_debug_id,
						BR_INCREFS, "BR_INCREFS");
			if (!ret && strong && !has_strong_ref)
				ret = binder_put_node_cmd(
						proc, thread, &ptr, node_ptr,
						node_cookie, node_debug_id,
						BR_ACQUIRE, "BR_ACQUIRE");
			if (!ret && !strong && has_strong_ref)
				ret = binder_put_node_cmd(
						proc, thread, &ptr, node_ptr,
						node_cookie, node_debug_id,
						BR_RELEASE, "BR_RELEASE");
			if (!ret && !weak && has_weak_ref)
				ret = binder_put_node_cmd(
						proc, thread, &ptr, node_ptr,
						node_cookie, node_debug_id,
						BR_DECREFS, "BR_DECREFS");
			if (orig_ptr == ptr)
				binder_debug(BINDER_DEBUG_INTERNAL_REFS,
					     "%d:%d node %d u%016llx c%016llx state unchanged\n",
					     proc->pid, thread->pid,
					     node_debug_id,
					     (u64)node_ptr,
					     (u64)node_cookie);
			if (ret)
				return ret;
		} break;
		case BINDER_WORK_DEAD_BINDER:
		case BINDER_WORK_DEAD_BINDER_AND_CLEAR:
		case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: {
			struct binder_ref_death *death;
			uint32_t cmd;
			binder_uintptr_t cookie;

			death = container_of(w, struct binder_ref_death, work);
			if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION)
				cmd = BR_CLEAR_DEATH_NOTIFICATION_DONE;
			else
				cmd = BR_DEAD_BINDER;
			cookie = death->cookie;

			binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION,
				     "%d:%d %s %016llx\n",
				      proc->pid, thread->pid,
				      cmd == BR_DEAD_BINDER ?
				      "BR_DEAD_BINDER" :
				      "BR_CLEAR_DEATH_NOTIFICATION_DONE",
				      (u64)cookie);
			if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) {
				binder_inner_proc_unlock(proc);
				kfree(death);
				binder_stats_deleted(BINDER_STAT_DEATH);
			} else {
				/* Parked on delivered_death until BC_DEAD_BINDER_DONE. */
				binder_enqueue_work_ilocked(
						w, &proc->delivered_death);
				binder_inner_proc_unlock(proc);
			}
			if (put_user(cmd, (uint32_t __user *)ptr))
				return -EFAULT;
			ptr += sizeof(uint32_t);
			if (put_user(cookie,
				     (binder_uintptr_t __user *)ptr))
				return -EFAULT;
			ptr += sizeof(binder_uintptr_t);
			binder_stat_br(proc, thread, cmd);
			if (cmd == BR_DEAD_BINDER)
				goto done; /* DEAD_BINDER notifications can cause transactions */
		} break;
		default:
			binder_inner_proc_unlock(proc);
			pr_err("%d:%d: bad work type %d\n",
			       proc->pid, thread->pid, w->type);
			break;
		}

		/* Only BINDER_WORK_TRANSACTION sets t; other items are done. */
		if (!t)
			continue;

		BUG_ON(t->buffer == NULL);
		if (t->buffer->target_node) {
			struct binder_node *target_node = t->buffer->target_node;

			trd->target.ptr = target_node->ptr;
			trd->cookie =  target_node->cookie;
			t->saved_priority = task_nice(current);
			if (t->priority < target_node->min_priority &&
			    !(t->flags & TF_ONE_WAY))
				binder_set_nice(t->priority);
			else if (!(t->flags & TF_ONE_WAY) ||
				 t->saved_priority > target_node->min_priority)
				binder_set_nice(target_node->min_priority);
			cmd = BR_TRANSACTION;
		} else {
			trd->target.ptr = 0;
			trd->cookie = 0;
			cmd = BR_REPLY;
		}
		trd->code = t->code;
		trd->flags = t->flags;
		trd->sender_euid = from_kuid(current_user_ns(), t->sender_euid);

		t_from = binder_get_txn_from(t);
		if (t_from) {
			struct task_struct *sender = t_from->proc->tsk;

			trd->sender_pid =
				task_tgid_nr_ns(sender,
						task_active_pid_ns(current));
		} else {
			trd->sender_pid = 0;
		}

		ret = binder_apply_fd_fixups(proc, t);
		if (ret) {
			struct binder_buffer *buffer = t->buffer;
			bool oneway = !!(t->flags & TF_ONE_WAY);
			int tid = t->debug_id;

			if (t_from)
				binder_thread_dec_tmpref(t_from);
			buffer->transaction = NULL;
			binder_cleanup_transaction(t, "fd fixups failed",
						   BR_FAILED_REPLY);
			binder_free_buf(proc, thread, buffer, true);
			binder_debug(BINDER_DEBUG_FAILED_TRANSACTION,
				     "%d:%d %stransaction %d fd fixups failed %d/%d, line %d\n",
				     proc->pid, thread->pid,
				     oneway ? "async " :
					(cmd == BR_REPLY ? "reply " : ""),
				     tid, BR_FAILED_REPLY, ret, __LINE__);
			if (cmd == BR_REPLY) {
				cmd = BR_FAILED_REPLY;
				if (put_user(cmd, (uint32_t __user *)ptr))
					return -EFAULT;
				ptr += sizeof(uint32_t);
				binder_stat_br(proc, thread, cmd);
				break;
			}
			continue;
		}
		trd->data_size = t->buffer->data_size;
		trd->offsets_size = t->buffer->offsets_size;
		trd->data.ptr.buffer = t->buffer->user_data;
		trd->data.ptr.offsets = trd->data.ptr.buffer +
					ALIGN(t->buffer->data_size,
					    sizeof(void *));

		tr.secctx = t->security_ctx;
		/* With a security context, copy the larger secctx record. */
		if (t->security_ctx) {
			cmd = BR_TRANSACTION_SEC_CTX;
			trsize = sizeof(tr);
		}
		if (put_user(cmd, (uint32_t __user *)ptr)) {
			if (t_from)
				binder_thread_dec_tmpref(t_from);

			binder_cleanup_transaction(t, "put_user failed",
						   BR_FAILED_REPLY);

			return -EFAULT;
		}
		ptr += sizeof(uint32_t);
		if (copy_to_user(ptr, &tr, trsize)) {
			if (t_from)
				binder_thread_dec_tmpref(t_from);

			binder_cleanup_transaction(t, "copy_to_user failed",
						   BR_FAILED_REPLY);

			return -EFAULT;
		}
		ptr += trsize;

		trace_binder_transaction_received(t);
		binder_stat_br(proc, thread, cmd);
		binder_debug(BINDER_DEBUG_TRANSACTION,
			     "%d:%d %s %d %d:%d, cmd %u size %zd-%zd ptr %016llx-%016llx\n",
			     proc->pid, thread->pid,
			     (cmd == BR_TRANSACTION) ? "BR_TRANSACTION" :
				(cmd == BR_TRANSACTION_SEC_CTX) ?
				     "BR_TRANSACTION_SEC_CTX" : "BR_REPLY",
			     t->debug_id, t_from ? t_from->proc->pid : 0,
			     t_from ? t_from->pid : 0, cmd,
			     t->buffer->data_size, t->buffer->offsets_size,
			     (u64)trd->data.ptr.buffer,
			     (u64)trd->data.ptr.offsets);

		if (t_from)
			binder_thread_dec_tmpref(t_from);
		t->buffer->allow_user_free = 1;
		/*
		 * A non-reply, non-oneway transaction is pushed on this
		 * thread's transaction stack; anything else is freed here.
		 */
		if (cmd != BR_REPLY && !(t->flags & TF_ONE_WAY)) {
			binder_inner_proc_lock(thread->proc);
			t->to_parent = thread->transaction_stack;
			t->to_thread = thread;
			thread->transaction_stack = t;
			binder_inner_proc_unlock(thread->proc);
		} else {
			binder_free_transaction(t);
		}
		break;
	}

done:

	*consumed = ptr - buffer;
	binder_inner_proc_lock(proc);
	if (proc->requested_threads == 0 &&
	    list_empty(&thread->proc->waiting_threads) &&
	    proc->requested_threads_started < proc->max_threads &&
	    (thread->looper & (BINDER_LOOPER_STATE_REGISTERED |
	     BINDER_LOOPER_STATE_ENTERED)) /* the user-space code fails to */
	     /*spawn a new thread if we leave this out */) {
		proc->requested_threads++;
		binder_inner_proc_unlock(proc);
		binder_debug(BINDER_DEBUG_THREADS,
			     "%d:%d BR_SPAWN_LOOPER\n",
			     proc->pid, thread->pid);
		/* Written at the buffer start rather than at the fill pointer. */
		if (put_user(BR_SPAWN_LOOPER, (uint32_t __user *)buffer))
			return -EFAULT;
		binder_stat_br(proc, thread, BR_SPAWN_LOOPER);
	} else
		binder_inner_proc_unlock(proc);
	return 0;
}

/*
 * Drain @list, disposing of every undelivered work item.
 *
 * Items are dequeued one at a time under the inner proc lock and handled
 * with the lock dropped: transactions are cleaned up with BR_DEAD_REPLY,
 * completion-type items and death records are freed, return errors and
 * node work are only logged or ignored.  Returns once the list is empty.
 */
static void binder_release_work(struct binder_proc *proc,
				struct list_head *list)
{
	struct binder_work *w;
	enum binder_work_type wtype;

	while (1) {
		binder_inner_proc_lock(proc);
		w = binder_dequeue_work_head_ilocked(list);
		wtype = w ?
w->type : 0;
		binder_inner_proc_unlock(proc);
		if (!w)
			return;

		switch (wtype) {
		case BINDER_WORK_TRANSACTION: {
			struct binder_transaction *t;

			t = container_of(w, struct binder_transaction, work);

			binder_cleanup_transaction(t, "process died.",
						   BR_DEAD_REPLY);
		} break;
		case BINDER_WORK_RETURN_ERROR: {
			struct binder_error *e = container_of(
					w, struct binder_error, work);

			binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
				"undelivered TRANSACTION_ERROR: %u\n",
				e->cmd);
		} break;
		case BINDER_WORK_TRANSACTION_PENDING:
		case BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT:
		case BINDER_WORK_TRANSACTION_COMPLETE: {
			binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
				"undelivered TRANSACTION_COMPLETE\n");
			kfree(w);
			binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
		} break;
		case BINDER_WORK_DEAD_BINDER_AND_CLEAR:
		case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: {
			struct binder_ref_death *death;

			death = container_of(w, struct binder_ref_death, work);
			binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
				"undelivered death notification, %016llx\n",
				(u64)death->cookie);
			kfree(death);
			binder_stats_deleted(BINDER_STAT_DEATH);
		} break;
		case BINDER_WORK_NODE:
			/* Nothing is freed for node work here. */
			break;
		default:
			pr_err("unexpected work type, %d, not freed\n",
			       wtype);
			break;
		}
	}

}

/*
 * Find the binder_thread of the calling task in @proc, optionally
 * inserting a new one.
 *
 * proc->threads is an rb-tree keyed by pid; it is searched for
 * current->pid.  An existing entry is returned as is.  Otherwise, if
 * @new_thread is NULL, NULL is returned; if not, @new_thread is
 * initialised for the current task, linked into the tree and returned.
 * The caller in this file holds the inner proc lock around the call.
 */
static struct binder_thread *binder_get_thread_ilocked(
		struct binder_proc *proc, struct binder_thread *new_thread)
{
	struct binder_thread *thread = NULL;
	struct rb_node *parent = NULL;
	struct rb_node **p = &proc->threads.rb_node;

	while (*p) {
		parent = *p;
		thread = rb_entry(parent, struct binder_thread, rb_node);

		if (current->pid < thread->pid)
			p = &(*p)->rb_left;
		else if (current->pid > thread->pid)
			p = &(*p)->rb_right;
		else
			return thread;
	}
	if (!new_thread)
		return NULL;
	thread = new_thread;
	binder_stats_created(BINDER_STAT_THREAD);
	thread->proc = proc;
	thread->pid = current->pid;
	atomic_set(&thread->tmp_ref, 0);
	init_waitqueue_head(&thread->wait);
	INIT_LIST_HEAD(&thread->todo);
	rb_link_node(&thread->rb_node, parent, p);
	rb_insert_color(&thread->rb_node, &proc->threads);
thread->looper_need_return = true;
	thread->return_error.work.type = BINDER_WORK_RETURN_ERROR;
	thread->return_error.cmd = BR_OK;
	thread->reply_error.work.type = BINDER_WORK_RETURN_ERROR;
	thread->reply_error.cmd = BR_OK;
	thread->ee.command = BR_OK;
	INIT_LIST_HEAD(&new_thread->waiting_thread_node);
	return thread;
}

/*
 * Return the binder_thread for the calling task in @proc, creating it on
 * first use.
 *
 * A lookup is done first under the inner proc lock.  If nothing is found,
 * a zeroed thread object is allocated with the lock dropped and the lookup
 * is repeated with that candidate; if the second lookup returns a
 * different object, the unused allocation is freed.
 *
 * Return: the thread, or NULL if the allocation fails.
 */
static struct binder_thread *binder_get_thread(struct binder_proc *proc)
{
	struct binder_thread *thread;
	struct binder_thread *new_thread;

	binder_inner_proc_lock(proc);
	thread = binder_get_thread_ilocked(proc, NULL);
	binder_inner_proc_unlock(proc);
	if (!thread) {
		new_thread = kzalloc(sizeof(*thread), GFP_KERNEL);
		if (new_thread == NULL)
			return NULL;
		binder_inner_proc_lock(proc);
		thread = binder_get_thread_ilocked(proc, new_thread);
		binder_inner_proc_unlock(proc);
		if (thread != new_thread)
			kfree(new_thread);
	}
	return thread;
}

/*
 * Final teardown of a binder_proc.
 *
 * BUGs if proc->todo or proc->delivered_death is not empty and warns about
 * a non-zero outstanding_txns count.  Drops this proc's reference on its
 * binder_device (freeing the context name and the device on the last
 * reference), then releases the allocator, task, cred and dbitmap before
 * freeing @proc itself.
 */
static void binder_free_proc(struct binder_proc *proc)
{
	struct binder_device *device;

	BUG_ON(!list_empty(&proc->todo));
	BUG_ON(!list_empty(&proc->delivered_death));
	if (proc->outstanding_txns)
		pr_warn("%s: Unexpected outstanding_txns %d\n",
			__func__, proc->outstanding_txns);
	device = container_of(proc->context, struct binder_device, context);
	if (refcount_dec_and_test(&device->ref)) {
		kfree(proc->context->name);
		kfree(device);
	}
	binder_alloc_deferred_release(&proc->alloc);
	put_task_struct(proc->tsk);
	put_cred(proc->cred);
	binder_stats_deleted(BINDER_STAT_PROC);
	dbitmap_free(&proc->dmap);
	kfree(proc);
}

/*
 * Free a binder_thread whose todo list must already be empty (BUG
 * otherwise).  Also drops the temporary proc reference that
 * binder_thread_release() took.
 */
static void binder_free_thread(struct binder_thread *thread)
{
	BUG_ON(!list_empty(&thread->todo));
	binder_stats_deleted(BINDER_STAT_THREAD);
	binder_proc_dec_tmpref(thread->proc);
	kfree(thread);
}

/*
 * Detach @thread from @proc and unwind its transaction stack.
 *
 * The thread is removed from proc->threads and marked dead.  Each
 * transaction on its stack is detached from the thread under that
 * transaction's own lock (taken hand over hand).  If the top-of-stack
 * transaction targeted this thread, a BR_DEAD_REPLY is sent for it.
 * Remaining thread work is released and the temporary thread reference
 * taken at entry is dropped.
 *
 * Return: the number of transactions that were on the thread's stack.
 */
static int binder_thread_release(struct binder_proc *proc,
				 struct binder_thread *thread)
{
	struct binder_transaction *t;
	struct binder_transaction *send_reply = NULL;
	int active_transactions = 0;
	struct binder_transaction *last_t = NULL;

	binder_inner_proc_lock(thread->proc);
	/*
	 * take a ref on the proc so it survives
	 * after we remove this thread from proc->threads.
	 * The corresponding dec is when we actually
	 * free the thread in binder_free_thread()
	 */
	proc->tmp_ref++;
	/*
	 * take a ref on this thread to ensure it
	 * survives while we are releasing it
	 */
	atomic_inc(&thread->tmp_ref);
	rb_erase(&thread->rb_node, &proc->threads);
	t = thread->transaction_stack;
	if (t) {
		spin_lock(&t->lock);
		if (t->to_thread == thread)
			send_reply = t;
	} else {
		__acquire(&t->lock);
	}
	thread->is_dead = true;

	while (t) {
		last_t = t;
		active_transactions++;
		binder_debug(BINDER_DEBUG_DEAD_TRANSACTION,
			     "release %d:%d transaction %d %s, still active\n",
			      proc->pid, thread->pid,
			     t->debug_id,
			     (t->to_thread == thread) ? "in" : "out");

		if (t->to_thread == thread) {
			thread->proc->outstanding_txns--;
			t->to_proc = NULL;
			t->to_thread = NULL;
			if (t->buffer) {
				t->buffer->transaction = NULL;
				t->buffer = NULL;
			}
			t = t->to_parent;
		} else if (t->from == thread) {
			t->from = NULL;
			t = t->from_parent;
		} else
			BUG();
		spin_unlock(&last_t->lock);
		if (t)
			spin_lock(&t->lock);
		else
			__acquire(&t->lock);
	}
	/* annotation for sparse, lock not acquired in last iteration above */
	__release(&t->lock);

	/*
	 * If this thread used poll, make sure we remove the waitqueue from any
	 * poll data structures holding it.
	 */
	if (thread->looper & BINDER_LOOPER_STATE_POLL)
		wake_up_pollfree(&thread->wait);

	binder_inner_proc_unlock(thread->proc);

	/*
	 * This is needed to avoid races between wake_up_pollfree() above and
	 * someone else removing the last entry from the queue for other reasons
	 * (e.g. ep_remove_wait_queue() being called due to an epoll file
	 * descriptor being closed).  Such other users hold an RCU read lock, so
	 * we can be sure they're done after we call synchronize_rcu().
	 */
	if (thread->looper & BINDER_LOOPER_STATE_POLL)
		synchronize_rcu();

	if (send_reply)
		binder_send_failed_reply(send_reply, BR_DEAD_REPLY);
	binder_release_work(proc, &thread->todo);
	binder_thread_dec_tmpref(thread);
	return active_transactions;
}

/*
 * poll() file operation.
 *
 * Looks up (or creates) the calling task's binder_thread, flags it with
 * BINDER_LOOPER_STATE_POLL and registers thread->wait with the poll table.
 *
 * Return: EPOLLERR if no thread could be obtained, EPOLLIN if
 * binder_has_work() reports pending work, otherwise 0.
 */
static __poll_t binder_poll(struct file *filp,
				struct poll_table_struct *wait)
{
	struct binder_proc *proc = filp->private_data;
	struct binder_thread *thread = NULL;
	bool wait_for_proc_work;

	thread = binder_get_thread(proc);
	if (!thread)
		return EPOLLERR;

	binder_inner_proc_lock(thread->proc);
	thread->looper |= BINDER_LOOPER_STATE_POLL;
	wait_for_proc_work = binder_available_for_proc_work_ilocked(thread);

	binder_inner_proc_unlock(thread->proc);

	poll_wait(filp, &thread->wait, wait);

	if (binder_has_work(thread, wait_for_proc_work))
		return EPOLLIN;

	return 0;
}

/*
 * Handle BINDER_WRITE_READ.
 *
 * Copies a struct binder_write_read from user space at @arg, runs the
 * write half (if write_size > 0) and then the read half (if
 * read_size > 0), and copies the updated structure back so user space can
 * see the consumed counts.  If the write half fails, read_consumed is
 * reported as 0.  After the read half the proc is woken when proc->todo is
 * still non-empty.
 *
 * Return: 0 on success, -EFAULT on a failed user copy, or the negative
 * error from the write/read half.
 */
static int binder_ioctl_write_read(struct file *filp, unsigned long arg,
				struct binder_thread *thread)
{
	int ret = 0;
	struct binder_proc *proc = filp->private_data;
	void __user *ubuf = (void __user *)arg;
	struct binder_write_read bwr;

	if (copy_from_user(&bwr, ubuf, sizeof(bwr))) {
		ret = -EFAULT;
		goto out;
	}
	binder_debug(BINDER_DEBUG_READ_WRITE,
		     "%d:%d write %lld at %016llx, read %lld at %016llx\n",
		     proc->pid, thread->pid,
		     (u64)bwr.write_size, (u64)bwr.write_buffer,
		     (u64)bwr.read_size, (u64)bwr.read_buffer);

	if (bwr.write_size > 0) {
		ret = binder_thread_write(proc, thread,
					  bwr.write_buffer,
					  bwr.write_size,
					  &bwr.write_consumed);
		trace_binder_write_done(ret);
		if (ret < 0) {
			bwr.read_consumed = 0;
			if (copy_to_user(ubuf, &bwr, sizeof(bwr)))
				ret = -EFAULT;
			goto out;
		}
	}
	if (bwr.read_size > 0) {
		ret = binder_thread_read(proc, thread, bwr.read_buffer,
					 bwr.read_size,
					 &bwr.read_consumed,
					 filp->f_flags & O_NONBLOCK);
		trace_binder_read_done(ret);
		binder_inner_proc_lock(proc);
		if (!binder_worklist_empty_ilocked(&proc->todo))
			binder_wakeup_proc_ilocked(proc);
		binder_inner_proc_unlock(proc);
		if (ret < 0) {
			if (copy_to_user(ubuf, &bwr, sizeof(bwr)))
				ret = -EFAULT;
			goto out;
		}
	}
binder_debug(BINDER_DEBUG_READ_WRITE,
		     "%d:%d wrote %lld of %lld, read return %lld of %lld\n",
		     proc->pid, thread->pid,
		     (u64)bwr.write_consumed, (u64)bwr.write_size,
		     (u64)bwr.read_consumed, (u64)bwr.read_size);
	if (copy_to_user(ubuf, &bwr, sizeof(bwr))) {
		ret = -EFAULT;
		goto out;
	}
out:
	return ret;
}

/*
 * Register the caller as context manager of its binder context.
 *
 * Runs under context->context_mgr_node_lock.  Fails with -EBUSY when a
 * manager node already exists, with the LSM's error when
 * security_binder_set_context_mgr() rejects the caller, and with -EPERM
 * when a manager uid was recorded earlier and differs from the caller's
 * euid.  Otherwise the uid is recorded (if not yet valid) and a new node
 * built from @fbo becomes the context manager node, holding one local
 * strong and one local weak reference.
 *
 * Return: 0 on success, -EBUSY, -EPERM, -ENOMEM or the LSM error.
 */
static int binder_ioctl_set_ctx_mgr(struct file *filp,
				    struct flat_binder_object *fbo)
{
	int ret = 0;
	struct binder_proc *proc = filp->private_data;
	struct binder_context *context = proc->context;
	struct binder_node *new_node;
	kuid_t curr_euid = current_euid();

	mutex_lock(&context->context_mgr_node_lock);
	if (context->binder_context_mgr_node) {
		pr_err("BINDER_SET_CONTEXT_MGR already set\n");
		ret = -EBUSY;
		goto out;
	}
	ret = security_binder_set_context_mgr(proc->cred);
	if (ret < 0)
		goto out;
	if (uid_valid(context->binder_context_mgr_uid)) {
		if (!uid_eq(context->binder_context_mgr_uid, curr_euid)) {
			pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n",
			       from_kuid(&init_user_ns, curr_euid),
			       from_kuid(&init_user_ns,
					 context->binder_context_mgr_uid));
			ret = -EPERM;
			goto out;
		}
	} else {
		context->binder_context_mgr_uid = curr_euid;
	}
	new_node = binder_new_node(proc, fbo);
	if (!new_node) {
		ret = -ENOMEM;
		goto out;
	}
	binder_node_lock(new_node);
	new_node->local_weak_refs++;
	new_node->local_strong_refs++;
	new_node->has_strong_ref = 1;
	new_node->has_weak_ref = 1;
	context->binder_context_mgr_node = new_node;
	binder_node_unlock(new_node);
	binder_put_node(new_node);
out:
	mutex_unlock(&context->context_mgr_node_lock);
	return ret;
}

/*
 * Handle BINDER_GET_NODE_INFO_FOR_REF: report reference counts of the node
 * behind info->handle.
 *
 * Every field of @info except handle must be zero on entry (-EINVAL
 * otherwise).  The caller must be the process owning the context manager
 * node (-EPERM otherwise).  On success strong_count is the sum of the
 * node's local and internal strong refs and weak_count is its local weak
 * refs.
 *
 * Return: 0 on success, -EINVAL or -EPERM on failure.
 */
static int binder_ioctl_get_node_info_for_ref(struct binder_proc *proc,
		struct binder_node_info_for_ref *info)
{
	struct binder_node *node;
	struct binder_context *context = proc->context;
	__u32 handle = info->handle;

	if (info->strong_count || info->weak_count || info->reserved1 ||
	    info->reserved2 || info->reserved3) {
		binder_user_error("%d BINDER_GET_NODE_INFO_FOR_REF: only handle may be non-zero.",
				  proc->pid);
		return -EINVAL;
	}

	/* This ioctl may only be used by the context manager */
	mutex_lock(&context->context_mgr_node_lock);
	if (!context->binder_context_mgr_node ||
		context->binder_context_mgr_node->proc != proc) {
		mutex_unlock(&context->context_mgr_node_lock);
		return -EPERM;
	}
	mutex_unlock(&context->context_mgr_node_lock);

	node = binder_get_node_from_ref(proc, handle, true, NULL);
	if (!node)
		return -EINVAL;

	info->strong_count = node->local_strong_refs +
		node->internal_strong_refs;
	info->weak_count = node->local_weak_refs;

	binder_put_node(node);

	return 0;
}

/*
 * Handle BINDER_GET_NODE_DEBUG_INFO: report the first node of @proc whose
 * ptr is greater than info->ptr.
 *
 * @info is zeroed first, so when no such node exists it is returned all
 * zero.  The walk over proc->nodes is done under the inner proc lock.
 * Always returns 0.
 */
static int binder_ioctl_get_node_debug_info(struct binder_proc *proc,
				struct binder_node_debug_info *info)
{
	struct rb_node *n;
	binder_uintptr_t ptr = info->ptr;

	memset(info, 0, sizeof(*info));

	binder_inner_proc_lock(proc);
	for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) {
		struct binder_node *node = rb_entry(n, struct binder_node,
						    rb_node);
		if (node->ptr > ptr) {
			info->ptr = node->ptr;
			info->cookie = node->cookie;
			info->has_strong_ref = node->has_strong_ref;
			info->has_weak_ref = node->has_weak_ref;
			break;
		}
	}
	binder_inner_proc_unlock(proc);

	return 0;
}

/*
 * Report whether @proc still has transactions in flight: true when
 * outstanding_txns is positive or any thread in proc->threads has a
 * non-empty transaction stack.  Callers in this file hold the inner proc
 * lock.
 */
static bool binder_txns_pending_ilocked(struct binder_proc *proc)
{
	struct rb_node *n;
	struct binder_thread *thread;

	if (proc->outstanding_txns > 0)
		return true;

	for (n = rb_first(&proc->threads); n; n = rb_next(n)) {
		thread = rb_entry(n, struct binder_thread, rb_node);
		if (thread->transaction_stack)
			return true;
	}
	return false;
}

/*
 * Freeze or unfreeze @target_proc according to info->enable.
 *
 * Unfreezing clears sync_recv, async_recv and is_frozen and returns 0.
 * Freezing sets is_frozen, optionally waits up to info->timeout_ms for
 * outstanding_txns to reach zero, then fails with -EAGAIN if transactions
 * are still pending.  Any negative result rolls is_frozen back to false.
 *
 * Return: a negative error on failure; on success the value may be 0 or
 * the positive remainder returned by the timed wait.
 */
static int binder_ioctl_freeze(struct binder_freeze_info *info,
			       struct binder_proc *target_proc)
{
	int ret = 0;

	if (!info->enable) {
		binder_inner_proc_lock(target_proc);
		target_proc->sync_recv = false;
		target_proc->async_recv = false;
		target_proc->is_frozen = false;
		binder_inner_proc_unlock(target_proc);
		return 0;
	}

	/*
	 * Freezing the target. Prevent new transactions by
	 * setting frozen state. If timeout specified, wait
	 * for transactions to drain.
	 */
	binder_inner_proc_lock(target_proc);
	target_proc->sync_recv = false;
	target_proc->async_recv = false;
	target_proc->is_frozen = true;
	binder_inner_proc_unlock(target_proc);

	if (info->timeout_ms > 0)
		ret = wait_event_interruptible_timeout(
			target_proc->freeze_wait,
			(!target_proc->outstanding_txns),
			msecs_to_jiffies(info->timeout_ms));

	/* Check pending transactions that wait for reply */
	if (ret >= 0) {
		binder_inner_proc_lock(target_proc);
		if (binder_txns_pending_ilocked(target_proc))
			ret = -EAGAIN;
		binder_inner_proc_unlock(target_proc);
	}

	if (ret < 0) {
		binder_inner_proc_lock(target_proc);
		target_proc->is_frozen = false;
		binder_inner_proc_unlock(target_proc);
	}

	return ret;
}

/*
 * Handle BINDER_GET_FROZEN_INFO for info->pid.
 *
 * Walks binder_procs under binder_procs_lock and ORs together the state of
 * every binder_proc with a matching pid: bit 0 of sync_recv is the proc's
 * sync_recv flag, bit 1 is set when transactions are pending, and
 * async_recv accumulates the proc's async_recv flag.
 *
 * Return: 0 if at least one proc matched, -EINVAL otherwise.
 */
static int binder_ioctl_get_freezer_info(
				struct binder_frozen_status_info *info)
{
	struct binder_proc *target_proc;
	bool found = false;
	__u32 txns_pending;

	info->sync_recv = 0;
	info->async_recv = 0;

	mutex_lock(&binder_procs_lock);
	hlist_for_each_entry(target_proc, &binder_procs, proc_node) {
		if (target_proc->pid == info->pid) {
			found = true;
			binder_inner_proc_lock(target_proc);
			txns_pending = binder_txns_pending_ilocked(target_proc);
			info->sync_recv |= target_proc->sync_recv |
					(txns_pending << 1);
			info->async_recv |= target_proc->async_recv;
			binder_inner_proc_unlock(target_proc);
		}
	}
	mutex_unlock(&binder_procs_lock);

	if (!found)
		return -EINVAL;

	return 0;
}

/*
 * Handle BINDER_GET_EXTENDED_ERROR: snapshot thread->ee and reset it to
 * BR_OK under the inner proc lock, then copy the snapshot to @ubuf.
 *
 * Return: 0 on success, -EFAULT if the copy to user space fails (the
 * stored error has already been reset by then).
 */
static int binder_ioctl_get_extended_error(struct binder_thread *thread,
					   void __user *ubuf)
{
	struct binder_extended_error ee;

	binder_inner_proc_lock(thread->proc);
	ee = thread->ee;
	binder_set_extended_error(&thread->ee, 0, BR_OK, 0);
	binder_inner_proc_unlock(thread->proc);

	if (copy_to_user(ubuf, &ee, sizeof(ee)))
		return -EFAULT;

	return 0;
}

/*
 * ioctl() file operation: dispatch on @cmd to the handlers above.
 *
 * Blocks while binder_stop_on_user_error is 2 or more, looks up (or
 * creates) the calling task's binder_thread, then runs the requested
 * command.  On the way out the thread's looper_need_return flag is cleared
 * and failures other than -EINTR are logged.
 *
 * Return: 0 on success or a negative errno.
 */
static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	int ret;
	struct binder_proc *proc = filp->private_data;
	struct binder_thread *thread;
	void __user *ubuf = (void __user *)arg;

	/*pr_info("binder_ioctl: %d:%d %x %lx\n",
			proc->pid, current->pid, cmd, arg);*/
binder_selftest_alloc(&proc->alloc); trace_binder_ioctl(cmd, arg); ret = wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret) goto err_unlocked; thread = binder_get_thread(proc); if (thread == NULL) { ret = -ENOMEM; goto err; } switch (cmd) { case BINDER_WRITE_READ: ret = binder_ioctl_write_read(filp, arg, thread); if (ret) goto err; break; case BINDER_SET_MAX_THREADS: { u32 max_threads; if (copy_from_user(&max_threads, ubuf, sizeof(max_threads))) { ret = -EINVAL; goto err; } binder_inner_proc_lock(proc); proc->max_threads = max_threads; binder_inner_proc_unlock(proc); break; } case BINDER_SET_CONTEXT_MGR_EXT: { struct flat_binder_object fbo; if (copy_from_user(&fbo, ubuf, sizeof(fbo))) { ret = -EINVAL; goto err; } ret = binder_ioctl_set_ctx_mgr(filp, &fbo); if (ret) goto err; break; } case BINDER_SET_CONTEXT_MGR: ret = binder_ioctl_set_ctx_mgr(filp, NULL); if (ret) goto err; break; case BINDER_THREAD_EXIT: binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n", proc->pid, thread->pid); binder_thread_release(proc, thread); thread = NULL; break; case BINDER_VERSION: { struct binder_version __user *ver = ubuf; if (put_user(BINDER_CURRENT_PROTOCOL_VERSION, &ver->protocol_version)) { ret = -EINVAL; goto err; } break; } case BINDER_GET_NODE_INFO_FOR_REF: { struct binder_node_info_for_ref info; if (copy_from_user(&info, ubuf, sizeof(info))) { ret = -EFAULT; goto err; } ret = binder_ioctl_get_node_info_for_ref(proc, &info); if (ret < 0) goto err; if (copy_to_user(ubuf, &info, sizeof(info))) { ret = -EFAULT; goto err; } break; } case BINDER_GET_NODE_DEBUG_INFO: { struct binder_node_debug_info info; if (copy_from_user(&info, ubuf, sizeof(info))) { ret = -EFAULT; goto err; } ret = binder_ioctl_get_node_debug_info(proc, &info); if (ret < 0) goto err; if (copy_to_user(ubuf, &info, sizeof(info))) { ret = -EFAULT; goto err; } break; } case BINDER_FREEZE: { struct binder_freeze_info info; struct binder_proc **target_procs = NULL, *target_proc; int 
target_procs_count = 0, i = 0; ret = 0; if (copy_from_user(&info, ubuf, sizeof(info))) { ret = -EFAULT; goto err; } mutex_lock(&binder_procs_lock); hlist_for_each_entry(target_proc, &binder_procs, proc_node) { if (target_proc->pid == info.pid) target_procs_count++; } if (target_procs_count == 0) { mutex_unlock(&binder_procs_lock); ret = -EINVAL; goto err; } target_procs = kcalloc(target_procs_count, sizeof(struct binder_proc *), GFP_KERNEL); if (!target_procs) { mutex_unlock(&binder_procs_lock); ret = -ENOMEM; goto err; } hlist_for_each_entry(target_proc, &binder_procs, proc_node) { if (target_proc->pid != info.pid) continue; binder_inner_proc_lock(target_proc); target_proc->tmp_ref++; binder_inner_proc_unlock(target_proc); target_procs[i++] = target_proc; } mutex_unlock(&binder_procs_lock); for (i = 0; i < target_procs_count; i++) { if (ret >= 0) ret = binder_ioctl_freeze(&info, target_procs[i]); binder_proc_dec_tmpref(target_procs[i]); } kfree(target_procs); if (ret < 0) goto err; break; } case BINDER_GET_FROZEN_INFO: { struct binder_frozen_status_info info; if (copy_from_user(&info, ubuf, sizeof(info))) { ret = -EFAULT; goto err; } ret = binder_ioctl_get_freezer_info(&info); if (ret < 0) goto err; if (copy_to_user(ubuf, &info, sizeof(info))) { ret = -EFAULT; goto err; } break; } case BINDER_ENABLE_ONEWAY_SPAM_DETECTION: { uint32_t enable; if (copy_from_user(&enable, ubuf, sizeof(enable))) { ret = -EFAULT; goto err; } binder_inner_proc_lock(proc); proc->oneway_spam_detection_enabled = (bool)enable; binder_inner_proc_unlock(proc); break; } case BINDER_GET_EXTENDED_ERROR: ret = binder_ioctl_get_extended_error(thread, ubuf); if (ret < 0) goto err; break; default: ret = -EINVAL; goto err; } ret = 0; err: if (thread) thread->looper_need_return = false; wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); if (ret && ret != -EINTR) pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); err_unlocked: 
trace_binder_ioctl_done(ret);
	return ret;
}

/*
 * vm_ops .open callback. Only emits a BINDER_DEBUG_OPEN_CLOSE message
 * describing the mapping (range, size in KiB, vm_flags, page protection).
 */
static void binder_vma_open(struct vm_area_struct *vma)
{
	struct binder_proc *proc = vma->vm_private_data;

	binder_debug(BINDER_DEBUG_OPEN_CLOSE,
		     "%d open vm area %lx-%lx (%ld K) vma %lx pagep %lx\n",
		     proc->pid, vma->vm_start, vma->vm_end,
		     (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
		     (unsigned long)pgprot_val(vma->vm_page_prot));
}

/*
 * vm_ops .close callback. Logs the mapping like binder_vma_open() and then
 * hands the owning proc's allocator to binder_alloc_vma_close().
 */
static void binder_vma_close(struct vm_area_struct *vma)
{
	struct binder_proc *proc = vma->vm_private_data;

	binder_debug(BINDER_DEBUG_OPEN_CLOSE,
		     "%d close vm area %lx-%lx (%ld K) vma %lx pagep %lx\n",
		     proc->pid, vma->vm_start, vma->vm_end,
		     (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
		     (unsigned long)pgprot_val(vma->vm_page_prot));
	binder_alloc_vma_close(&proc->alloc);
}

/* vm_ops .fault callback: unconditionally reports VM_FAULT_SIGBUS. */
static vm_fault_t binder_vm_fault(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}

/* VMA callbacks installed on every mapping by binder_mmap(). */
static const struct vm_operations_struct binder_vm_ops = {
	.open = binder_vma_open,
	.close = binder_vma_close,
	.fault = binder_vm_fault,
};

/*
 * mmap handler for a binder file.
 *
 * Returns -EINVAL when the caller's thread-group leader is not the task
 * recorded in proc->tsk, -EPERM when the requested vm_flags contain any of
 * FORBIDDEN_MMAP_FLAGS, otherwise the result of binder_alloc_mmap_handler().
 */
static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct binder_proc *proc = filp->private_data;

	if (proc->tsk != current->group_leader)
		return -EINVAL;

	binder_debug(BINDER_DEBUG_OPEN_CLOSE,
		     "%s: %d %lx-%lx (%ld K) vma %lx pagep %lx\n",
		     __func__, proc->pid, vma->vm_start, vma->vm_end,
		     (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
		     (unsigned long)pgprot_val(vma->vm_page_prot));

	if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) {
		pr_err("%s: %d %lx-%lx %s failed %d\n", __func__,
		       proc->pid, vma->vm_start, vma->vm_end, "bad vm_flags", -EPERM);
		return -EPERM;
	}
	/*
	 * NOTE(review): vm_flags_mod() is expected to set the first mask and
	 * clear the second - confirm against its definition.
	 */
	vm_flags_mod(vma, VM_DONTCOPY | VM_MIXEDMAP, VM_MAYWRITE);

	vma->vm_ops = &binder_vm_ops;
	vma->vm_private_data = proc;

	return binder_alloc_mmap_handler(&proc->alloc, vma);
}

/*
 * open handler: allocates and initialises a binder_proc for the calling
 * thread group, stores it in filp->private_data and links it into
 * binder_procs. Per-PID debugfs/binderfs log files are created only when no
 * other binder_proc with the same pid was already on the list.
 *
 * Returns 0 on success or -ENOMEM if the binder_proc cannot be allocated.
 */
static int binder_open(struct inode *nodp, struct file *filp)
{
	struct binder_proc *proc, *itr;
	struct binder_device *binder_dev;
	struct binderfs_info *info;
	struct dentry *binder_binderfs_dir_entry_proc = NULL;
	bool existing_pid = false;

	binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d:%d\n", __func__,
		     current->group_leader->pid, current->pid);

	proc = kzalloc(sizeof(*proc), GFP_KERNEL);
	if (proc == NULL)
		return -ENOMEM;

	dbitmap_init(&proc->dmap);
	spin_lock_init(&proc->inner_lock);
	spin_lock_init(&proc->outer_lock);
	/* Pin the group leader's task_struct and the opener's credentials. */
	get_task_struct(current->group_leader);
	proc->tsk = current->group_leader;
	proc->cred = get_cred(filp->f_cred);
	INIT_LIST_HEAD(&proc->todo);
	init_waitqueue_head(&proc->freeze_wait);
	proc->default_priority = task_nice(current);
	/* binderfs stashes devices in i_private */
	if (is_binderfs_device(nodp)) {
		binder_dev = nodp->i_private;
		info = nodp->i_sb->s_fs_info;
		binder_binderfs_dir_entry_proc = info->proc_log_dir;
	} else {
		binder_dev = container_of(filp->private_data,
					  struct binder_device, miscdev);
	}
	refcount_inc(&binder_dev->ref);
	proc->context = &binder_dev->context;
	binder_alloc_init(&proc->alloc);

	binder_stats_created(BINDER_STAT_PROC);
	proc->pid = current->group_leader->pid;
	INIT_LIST_HEAD(&proc->delivered_death);
	INIT_LIST_HEAD(&proc->waiting_threads);
	filp->private_data = proc;

	/* Check for an existing proc with this pid, then publish the new one. */
	mutex_lock(&binder_procs_lock);
	hlist_for_each_entry(itr, &binder_procs, proc_node) {
		if (itr->pid == proc->pid) {
			existing_pid = true;
			break;
		}
	}
	hlist_add_head(&proc->proc_node, &binder_procs);
	mutex_unlock(&binder_procs_lock);

	if (binder_debugfs_dir_entry_proc && !existing_pid) {
		char strbuf[11];

		snprintf(strbuf, sizeof(strbuf), "%u", proc->pid);
		/*
		 * proc debug entries are shared between contexts.
		 * Only create for the first PID to avoid debugfs log spamming
		 * The printing code will anyway print all contexts for a given
		 * PID so this is not a problem.
		 */
		proc->debugfs_entry = debugfs_create_file(strbuf, 0444,
			binder_debugfs_dir_entry_proc,
			(void *)(unsigned long)proc->pid,
			&proc_fops);
	}

	if (binder_binderfs_dir_entry_proc && !existing_pid) {
		char strbuf[11];
		struct dentry *binderfs_entry;

		snprintf(strbuf, sizeof(strbuf), "%u", proc->pid);
		/*
		 * Similar to debugfs, the process specific log file is shared
		 * between contexts. Only create for the first PID.
		 * This is ok since same as debugfs, the log file will contain
		 * information on all contexts of a given PID.
		 */
		binderfs_entry = binderfs_create_file(binder_binderfs_dir_entry_proc,
			strbuf, &proc_fops, (void *)(unsigned long)proc->pid);
		if (!IS_ERR(binderfs_entry)) {
			proc->binderfs_entry = binderfs_entry;
		} else {
			int error;

			/* Failure is only reported; the open itself still succeeds. */
			error = PTR_ERR(binderfs_entry);
			pr_warn("Unable to create file %s in binderfs (error %d)\n",
				strbuf, error);
		}
	}

	return 0;
}

/* flush handler: queues BINDER_DEFERRED_FLUSH work for this proc. */
static int binder_flush(struct file *filp, fl_owner_t id)
{
	struct binder_proc *proc = filp->private_data;

	binder_defer_work(proc, BINDER_DEFERRED_FLUSH);

	return 0;
}

/*
 * Deferred part of flush: under the inner proc lock, marks every thread of
 * @proc with looper_need_return and wakes the ones whose looper state has
 * BINDER_LOOPER_STATE_WAITING set.
 */
static void binder_deferred_flush(struct binder_proc *proc)
{
	struct rb_node *n;
	int wake_count = 0;

	binder_inner_proc_lock(proc);
	for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) {
		struct binder_thread *thread = rb_entry(n, struct binder_thread, rb_node);

		thread->looper_need_return = true;
		if (thread->looper & BINDER_LOOPER_STATE_WAITING) {
			wake_up_interruptible(&thread->wait);
			wake_count++;
		}
	}
	binder_inner_proc_unlock(proc);

	binder_debug(BINDER_DEBUG_OPEN_CLOSE,
		     "binder_flush: %d woke %d threads\n", proc->pid,
		     wake_count);
}

/*
 * release handler: removes the per-proc debugfs/binderfs entries and queues
 * BINDER_DEFERRED_RELEASE work; the teardown itself is not done here.
 */
static int binder_release(struct inode *nodp, struct file *filp)
{
	struct binder_proc *proc = filp->private_data;

	debugfs_remove(proc->debugfs_entry);

	if (proc->binderfs_entry) {
		binderfs_remove_file(proc->binderfs_entry);
		proc->binderfs_entry = NULL;
	}

	binder_defer_work(proc, BINDER_DEFERRED_RELEASE);

	return 0;
}

static int binder_node_release(struct binder_node *node, int refs)
{
	struct binder_ref *ref;
	int death = 0;
struct binder_proc *proc = node->proc;

	binder_release_work(proc, &node->async_todo);

	binder_node_lock(node);
	binder_inner_proc_lock(proc);
	binder_dequeue_work_ilocked(&node->work);
	/*
	 * The caller must have taken a temporary ref on the node,
	 */
	BUG_ON(!node->tmp_refs);
	/* Nobody else references the node: free it right away. */
	if (hlist_empty(&node->refs) && node->tmp_refs == 1) {
		binder_inner_proc_unlock(proc);
		binder_node_unlock(node);
		binder_free_node(node);

		return refs;
	}

	/* Detach the node from its proc and park it on the dead-node list. */
	node->proc = NULL;
	node->local_strong_refs = 0;
	node->local_weak_refs = 0;
	binder_inner_proc_unlock(proc);

	spin_lock(&binder_dead_nodes_lock);
	hlist_add_head(&node->dead_node, &binder_dead_nodes);
	spin_unlock(&binder_dead_nodes_lock);

	hlist_for_each_entry(ref, &node->refs, node_entry) {
		refs++;
		/*
		 * Need the node lock to synchronize
		 * with new notification requests and the
		 * inner lock to synchronize with queued
		 * death notifications.
		 */
		binder_inner_proc_lock(ref->proc);
		if (!ref->death) {
			binder_inner_proc_unlock(ref->proc);
			continue;
		}

		death++;

		/* Queue the death notification on the referring proc. */
		BUG_ON(!list_empty(&ref->death->work.entry));
		ref->death->work.type = BINDER_WORK_DEAD_BINDER;
		binder_enqueue_work_ilocked(&ref->death->work,
					    &ref->proc->todo);
		binder_wakeup_proc_ilocked(ref->proc);
		binder_inner_proc_unlock(ref->proc);
	}

	binder_debug(BINDER_DEBUG_DEAD_BINDER,
		     "node %d now dead, refs %d, death %d\n",
		     node->debug_id, refs, death);
	binder_node_unlock(node);
	binder_put_node(node);

	/* @refs plus the number of references that were found on this node. */
	return refs;
}

/*
 * Deferred part of release: unlinks @proc from binder_procs, clears the
 * context manager node if it belonged to @proc, marks the proc dead and then
 * releases all of its threads, nodes, outgoing refs and queued work.
 * The temporary reference taken at the start is dropped last.
 */
static void binder_deferred_release(struct binder_proc *proc)
{
	struct binder_context *context = proc->context;
	struct rb_node *n;
	int threads, nodes, incoming_refs, outgoing_refs, active_transactions;

	mutex_lock(&binder_procs_lock);
	hlist_del(&proc->proc_node);
	mutex_unlock(&binder_procs_lock);

	mutex_lock(&context->context_mgr_node_lock);
	if (context->binder_context_mgr_node &&
	    context->binder_context_mgr_node->proc == proc) {
		binder_debug(BINDER_DEBUG_DEAD_BINDER,
			     "%s: %d context_mgr_node gone\n",
			     __func__, proc->pid);
		context->binder_context_mgr_node = NULL;
	}
	mutex_unlock(&context->context_mgr_node_lock);
	binder_inner_proc_lock(proc);
	/*
	 * Make sure proc stays alive after we
	 * remove all the threads
	 */
	proc->tmp_ref++;

	proc->is_dead = true;
	proc->is_frozen = false;
	proc->sync_recv = false;
	proc->async_recv = false;
	threads = 0;
	active_transactions = 0;
	/* The inner lock is dropped around each binder_thread_release() call. */
	while ((n = rb_first(&proc->threads))) {
		struct binder_thread *thread;

		thread = rb_entry(n, struct binder_thread, rb_node);
		binder_inner_proc_unlock(proc);
		threads++;
		active_transactions += binder_thread_release(proc, thread);
		binder_inner_proc_lock(proc);
	}

	nodes = 0;
	incoming_refs = 0;
	while ((n = rb_first(&proc->nodes))) {
		struct binder_node *node;

		node = rb_entry(n, struct binder_node, rb_node);
		nodes++;
		/*
		 * take a temporary ref on the node before
		 * calling binder_node_release() which will either
		 * kfree() the node or call binder_put_node()
		 */
		binder_inc_node_tmpref_ilocked(node);
		rb_erase(&node->rb_node, &proc->nodes);
		binder_inner_proc_unlock(proc);
		incoming_refs = binder_node_release(node, incoming_refs);
		binder_inner_proc_lock(proc);
	}
	binder_inner_proc_unlock(proc);

	outgoing_refs = 0;
	binder_proc_lock(proc);
	/* The proc lock is dropped around each binder_free_ref() call. */
	while ((n = rb_first(&proc->refs_by_desc))) {
		struct binder_ref *ref;

		ref = rb_entry(n, struct binder_ref, rb_node_desc);
		outgoing_refs++;
		binder_cleanup_ref_olocked(ref);
		binder_proc_unlock(proc);
		binder_free_ref(ref);
		binder_proc_lock(proc);
	}
	binder_proc_unlock(proc);

	binder_release_work(proc, &proc->todo);
	binder_release_work(proc, &proc->delivered_death);

	binder_debug(BINDER_DEBUG_OPEN_CLOSE,
		     "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d\n",
		     __func__, proc->pid, threads, nodes, incoming_refs,
		     outgoing_refs, active_transactions);

	binder_proc_dec_tmpref(proc);
}

/*
 * Work function: repeatedly pops one proc from binder_deferred_list under
 * binder_deferred_lock and runs whichever deferred actions were requested
 * for it, until the list is empty.
 */
static void binder_deferred_func(struct work_struct *work)
{
	struct binder_proc *proc;

	int defer;

	do {
		mutex_lock(&binder_deferred_lock);
		if (!hlist_empty(&binder_deferred_list)) {
			proc = hlist_entry(binder_deferred_list.first,
					struct binder_proc, deferred_work_node);
hlist_del_init(&proc->deferred_work_node); defer = proc->deferred_work; proc->deferred_work = 0; } else { proc = NULL; defer = 0; } mutex_unlock(&binder_deferred_lock); if (defer & BINDER_DEFERRED_FLUSH) binder_deferred_flush(proc); if (defer & BINDER_DEFERRED_RELEASE) binder_deferred_release(proc); /* frees proc */ } while (proc); } static DECLARE_WORK(binder_deferred_work, binder_deferred_func); static void binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer) { mutex_lock(&binder_deferred_lock); proc->deferred_work |= defer; if (hlist_unhashed(&proc->deferred_work_node)) { hlist_add_head(&proc->deferred_work_node, &binder_deferred_list); schedule_work(&binder_deferred_work); } mutex_unlock(&binder_deferred_lock); } static void print_binder_transaction_ilocked(struct seq_file *m, struct binder_proc *proc, const char *prefix, struct binder_transaction *t) { struct binder_proc *to_proc; struct binder_buffer *buffer = t->buffer; ktime_t current_time = ktime_get(); spin_lock(&t->lock); to_proc = t->to_proc; seq_printf(m, "%s %d: %pK from %d:%d to %d:%d code %x flags %x pri %ld r%d elapsed %lldms", prefix, t->debug_id, t, t->from_pid, t->from_tid, to_proc ? to_proc->pid : 0, t->to_thread ? 
t->to_thread->pid : 0, t->code, t->flags, t->priority, t->need_reply, ktime_ms_delta(current_time, t->start_time)); spin_unlock(&t->lock); if (proc != to_proc) { /* * Can only safely deref buffer if we are holding the * correct proc inner lock for this node */ seq_puts(m, "\n"); return; } if (buffer == NULL) { seq_puts(m, " buffer free\n"); return; } if (buffer->target_node) seq_printf(m, " node %d", buffer->target_node->debug_id); seq_printf(m, " size %zd:%zd offset %lx\n", buffer->data_size, buffer->offsets_size, proc->alloc.buffer - buffer->user_data); } static void print_binder_work_ilocked(struct seq_file *m, struct binder_proc *proc, const char *prefix, const char *transaction_prefix, struct binder_work *w) { struct binder_node *node; struct binder_transaction *t; switch (w->type) { case BINDER_WORK_TRANSACTION: t = container_of(w, struct binder_transaction, work); print_binder_transaction_ilocked( m, proc, transaction_prefix, t); break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( w, struct binder_error, work); seq_printf(m, "%stransaction error: %u\n", prefix, e->cmd); } break; case BINDER_WORK_TRANSACTION_COMPLETE: seq_printf(m, "%stransaction complete\n", prefix); break; case BINDER_WORK_NODE: node = container_of(w, struct binder_node, work); seq_printf(m, "%snode work %d: u%016llx c%016llx\n", prefix, node->debug_id, (u64)node->ptr, (u64)node->cookie); break; case BINDER_WORK_DEAD_BINDER: seq_printf(m, "%shas dead binder\n", prefix); break; case BINDER_WORK_DEAD_BINDER_AND_CLEAR: seq_printf(m, "%shas cleared dead binder\n", prefix); break; case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: seq_printf(m, "%shas cleared death notification\n", prefix); break; default: seq_printf(m, "%sunknown work: type %d\n", prefix, w->type); break; } } static void print_binder_thread_ilocked(struct seq_file *m, struct binder_thread *thread, int print_always) { struct binder_transaction *t; struct binder_work *w; size_t start_pos = m->count; size_t 
header_pos;

	seq_printf(m, " thread %d: l %02x need_return %d tr %d\n",
			thread->pid, thread->looper,
			thread->looper_need_return,
			atomic_read(&thread->tmp_ref));
	header_pos = m->count;
	/* Walk the transaction stack, following the link that matches our role. */
	t = thread->transaction_stack;
	while (t) {
		if (t->from == thread) {
			print_binder_transaction_ilocked(m, thread->proc,
					" outgoing transaction", t);
			t = t->from_parent;
		} else if (t->to_thread == thread) {
			print_binder_transaction_ilocked(m, thread->proc,
					" incoming transaction", t);
			t = t->to_parent;
		} else {
			print_binder_transaction_ilocked(m, thread->proc,
					" bad transaction", t);
			t = NULL;
		}
	}
	list_for_each_entry(w, &thread->todo, entry) {
		print_binder_work_ilocked(m, thread->proc, " ",
					  " pending transaction", w);
	}
	/* Nothing beyond the header was printed: rewind and drop the header too. */
	if (!print_always && m->count == header_pos)
		m->count = start_pos;
}

/*
 * Print one node: its identifiers and reference counters, the pids of all
 * procs holding a ref to it, and (only while the node still has a proc) its
 * pending async work.
 */
static void print_binder_node_nilocked(struct seq_file *m,
				       struct binder_node *node)
{
	struct binder_ref *ref;
	struct binder_work *w;
	int count;

	count = hlist_count_nodes(&node->refs);

	seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d tr %d",
		   node->debug_id, (u64)node->ptr, (u64)node->cookie,
		   node->has_strong_ref, node->has_weak_ref,
		   node->local_strong_refs, node->local_weak_refs,
		   node->internal_strong_refs, count, node->tmp_refs);
	if (count) {
		seq_puts(m, " proc");
		hlist_for_each_entry(ref, &node->refs, node_entry)
			seq_printf(m, " %d", ref->proc->pid);
	}
	seq_puts(m, "\n");
	if (node->proc) {
		list_for_each_entry(w, &node->async_todo, entry)
			print_binder_work_ilocked(m, node->proc, " ",
					  " pending async transaction", w);
	}
}

/*
 * Print one ref while holding the lock of the node it points to; the node
 * is labelled "dead " when it no longer has a proc.
 */
static void print_binder_ref_olocked(struct seq_file *m,
				     struct binder_ref *ref)
{
	binder_node_lock(ref->node);
	seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %pK\n",
		   ref->data.debug_id, ref->data.desc,
		   ref->node->proc ? "" : "dead ",
		   ref->node->debug_id, ref->data.strong,
		   ref->data.weak, ref->death);
	binder_node_unlock(ref->node);
}

/*
 * Print the state of @proc: threads, nodes, (with @print_all) refs,
 * allocated buffers and pending work. Without @print_all only nodes with an
 * async transaction are shown, and the whole section is rewound if nothing
 * was printed after the header.
 */
static void print_binder_proc(struct seq_file *m,
			      struct binder_proc *proc, int print_all)
{
	struct binder_work *w;
	struct rb_node *n;
	size_t start_pos = m->count;
	size_t header_pos;
	struct binder_node *last_node = NULL;

	seq_printf(m, "proc %d\n", proc->pid);
	seq_printf(m, "context %s\n", proc->context->name);
	header_pos = m->count;

	binder_inner_proc_lock(proc);
	for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n))
		print_binder_thread_ilocked(m, rb_entry(n, struct binder_thread,
						rb_node), print_all);

	for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) {
		struct binder_node *node = rb_entry(n, struct binder_node,
						    rb_node);
		if (!print_all && !node->has_async_transaction)
			continue;

		/*
		 * take a temporary reference on the node so it
		 * survives and isn't removed from the tree
		 * while we print it.
		 */
		binder_inc_node_tmpref_ilocked(node);
		/* Need to drop inner lock to take node lock */
		binder_inner_proc_unlock(proc);
		/* The previous node's temporary ref is only dropped now. */
		if (last_node)
			binder_put_node(last_node);
		binder_node_inner_lock(node);
		print_binder_node_nilocked(m, node);
		binder_node_inner_unlock(node);
		last_node = node;
		binder_inner_proc_lock(proc);
	}
	binder_inner_proc_unlock(proc);
	if (last_node)
		binder_put_node(last_node);

	if (print_all) {
		binder_proc_lock(proc);
		for (n = rb_first(&proc->refs_by_desc);
		     n != NULL;
		     n = rb_next(n))
			print_binder_ref_olocked(m, rb_entry(n,
							    struct binder_ref,
							    rb_node_desc));
		binder_proc_unlock(proc);
	}
	binder_alloc_print_allocated(m, &proc->alloc);
	binder_inner_proc_lock(proc);
	list_for_each_entry(w, &proc->todo, entry)
		print_binder_work_ilocked(m, proc, " ",
					  " pending transaction", w);
	/* The loop body runs at most once: it only tests for a non-empty list. */
	list_for_each_entry(w, &proc->delivered_death, entry) {
		seq_puts(m, " has delivered dead binder\n");
		break;
	}
	binder_inner_proc_unlock(proc);
	if (!print_all && m->count == header_pos)
		m->count = start_pos;
}

/* Names for the stats->br[] counters, indexed in the same order. */
static const char * const binder_return_strings[] = {
	"BR_ERROR",
"BR_OK", "BR_TRANSACTION", "BR_REPLY", "BR_ACQUIRE_RESULT", "BR_DEAD_REPLY", "BR_TRANSACTION_COMPLETE", "BR_INCREFS", "BR_ACQUIRE", "BR_RELEASE", "BR_DECREFS", "BR_ATTEMPT_ACQUIRE", "BR_NOOP", "BR_SPAWN_LOOPER", "BR_FINISHED", "BR_DEAD_BINDER", "BR_CLEAR_DEATH_NOTIFICATION_DONE", "BR_FAILED_REPLY", "BR_FROZEN_REPLY", "BR_ONEWAY_SPAM_SUSPECT", "BR_TRANSACTION_PENDING_FROZEN" }; static const char * const binder_command_strings[] = { "BC_TRANSACTION", "BC_REPLY", "BC_ACQUIRE_RESULT", "BC_FREE_BUFFER", "BC_INCREFS", "BC_ACQUIRE", "BC_RELEASE", "BC_DECREFS", "BC_INCREFS_DONE", "BC_ACQUIRE_DONE", "BC_ATTEMPT_ACQUIRE", "BC_REGISTER_LOOPER", "BC_ENTER_LOOPER", "BC_EXIT_LOOPER", "BC_REQUEST_DEATH_NOTIFICATION", "BC_CLEAR_DEATH_NOTIFICATION", "BC_DEAD_BINDER_DONE", "BC_TRANSACTION_SG", "BC_REPLY_SG", }; static const char * const binder_objstat_strings[] = { "proc", "thread", "node", "ref", "death", "transaction", "transaction_complete" }; static void print_binder_stats(struct seq_file *m, const char *prefix, struct binder_stats *stats) { int i; BUILD_BUG_ON(ARRAY_SIZE(stats->bc) != ARRAY_SIZE(binder_command_strings)); for (i = 0; i < ARRAY_SIZE(stats->bc); i++) { int temp = atomic_read(&stats->bc[i]); if (temp) seq_printf(m, "%s%s: %d\n", prefix, binder_command_strings[i], temp); } BUILD_BUG_ON(ARRAY_SIZE(stats->br) != ARRAY_SIZE(binder_return_strings)); for (i = 0; i < ARRAY_SIZE(stats->br); i++) { int temp = atomic_read(&stats->br[i]); if (temp) seq_printf(m, "%s%s: %d\n", prefix, binder_return_strings[i], temp); } BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != ARRAY_SIZE(binder_objstat_strings)); BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != ARRAY_SIZE(stats->obj_deleted)); for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) { int created = atomic_read(&stats->obj_created[i]); int deleted = atomic_read(&stats->obj_deleted[i]); if (created || deleted) seq_printf(m, "%s%s: active %d total %d\n", prefix, binder_objstat_strings[i], created - deleted, created); } } static 
void print_binder_proc_stats(struct seq_file *m, struct binder_proc *proc) { struct binder_work *w; struct binder_thread *thread; struct rb_node *n; int count, strong, weak, ready_threads; size_t free_async_space = binder_alloc_get_free_async_space(&proc->alloc); seq_printf(m, "proc %d\n", proc->pid); seq_printf(m, "context %s\n", proc->context->name); count = 0; ready_threads = 0; binder_inner_proc_lock(proc); for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) count++; list_for_each_entry(thread, &proc->waiting_threads, waiting_thread_node) ready_threads++; seq_printf(m, " threads: %d\n", count); seq_printf(m, " requested threads: %d+%d/%d\n" " ready threads %d\n" " free async space %zd\n", proc->requested_threads, proc->requested_threads_started, proc->max_threads, ready_threads, free_async_space); count = 0; for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) count++; binder_inner_proc_unlock(proc); seq_printf(m, " nodes: %d\n", count); count = 0; strong = 0; weak = 0; binder_proc_lock(proc); for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { struct binder_ref *ref = rb_entry(n, struct binder_ref, rb_node_desc); count++; strong += ref->data.strong; weak += ref->data.weak; } binder_proc_unlock(proc); seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak); count = binder_alloc_get_allocated_count(&proc->alloc); seq_printf(m, " buffers: %d\n", count); binder_alloc_print_pages(m, &proc->alloc); count = 0; binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) { if (w->type == BINDER_WORK_TRANSACTION) count++; } binder_inner_proc_unlock(proc); seq_printf(m, " pending transactions: %d\n", count); print_binder_stats(m, " ", &proc->stats); } static int state_show(struct seq_file *m, void *unused) { struct binder_proc *proc; struct binder_node *node; struct binder_node *last_node = NULL; seq_puts(m, "binder state:\n"); spin_lock(&binder_dead_nodes_lock); if (!hlist_empty(&binder_dead_nodes)) seq_puts(m, "dead 
nodes:\n"); hlist_for_each_entry(node, &binder_dead_nodes, dead_node) { /* * take a temporary reference on the node so it * survives and isn't removed from the list * while we print it. */ node->tmp_refs++; spin_unlock(&binder_dead_nodes_lock); if (last_node) binder_put_node(last_node); binder_node_lock(node); print_binder_node_nilocked(m, node); binder_node_unlock(node); last_node = node; spin_lock(&binder_dead_nodes_lock); } spin_unlock(&binder_dead_nodes_lock); if (last_node) binder_put_node(last_node); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 1); mutex_unlock(&binder_procs_lock); return 0; } static int stats_show(struct seq_file *m, void *unused) { struct binder_proc *proc; seq_puts(m, "binder stats:\n"); print_binder_stats(m, "", &binder_stats); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc_stats(m, proc); mutex_unlock(&binder_procs_lock); return 0; } static int transactions_show(struct seq_file *m, void *unused) { struct binder_proc *proc; seq_puts(m, "binder transactions:\n"); mutex_lock(&binder_procs_lock); hlist_for_each_entry(proc, &binder_procs, proc_node) print_binder_proc(m, proc, 0); mutex_unlock(&binder_procs_lock); return 0; } static int proc_show(struct seq_file *m, void *unused) { struct binder_proc *itr; int pid = (unsigned long)m->private; mutex_lock(&binder_procs_lock); hlist_for_each_entry(itr, &binder_procs, proc_node) { if (itr->pid == pid) { seq_puts(m, "binder proc state:\n"); print_binder_proc(m, itr, 1); } } mutex_unlock(&binder_procs_lock); return 0; } static void print_binder_transaction_log_entry(struct seq_file *m, struct binder_transaction_log_entry *e) { int debug_id = READ_ONCE(e->debug_id_done); /* * read barrier to guarantee debug_id_done read before * we print the log values */ smp_rmb(); seq_printf(m, "%d: %s from %d:%d to %d:%d context %s node %d handle %d size %d:%d ret %d/%d l=%d", e->debug_id, 
(e->call_type == 2) ? "reply" : ((e->call_type == 1) ? "async" : "call "), e->from_proc, e->from_thread, e->to_proc, e->to_thread, e->context_name, e->to_node, e->target_handle, e->data_size, e->offsets_size, e->return_error, e->return_error_param, e->return_error_line); /* * read-barrier to guarantee read of debug_id_done after * done printing the fields of the entry */ smp_rmb(); seq_printf(m, debug_id && debug_id == READ_ONCE(e->debug_id_done) ? "\n" : " (incomplete)\n"); } static int transaction_log_show(struct seq_file *m, void *unused) { struct binder_transaction_log *log = m->private; unsigned int log_cur = atomic_read(&log->cur); unsigned int count; unsigned int cur; int i; count = log_cur + 1; cur = count < ARRAY_SIZE(log->entry) && !log->full ? 0 : count % ARRAY_SIZE(log->entry); if (count > ARRAY_SIZE(log->entry) || log->full) count = ARRAY_SIZE(log->entry); for (i = 0; i < count; i++) { unsigned int index = cur++ % ARRAY_SIZE(log->entry); print_binder_transaction_log_entry(m, &log->entry[index]); } return 0; } const struct file_operations binder_fops = { .owner = THIS_MODULE, .poll = binder_poll, .unlocked_ioctl = binder_ioctl, .compat_ioctl = compat_ptr_ioctl, .mmap = binder_mmap, .open = binder_open, .flush = binder_flush, .release = binder_release, }; DEFINE_SHOW_ATTRIBUTE(state); DEFINE_SHOW_ATTRIBUTE(stats); DEFINE_SHOW_ATTRIBUTE(transactions); DEFINE_SHOW_ATTRIBUTE(transaction_log); const struct binder_debugfs_entry binder_debugfs_entries[] = { { .name = "state", .mode = 0444, .fops = &state_fops, .data = NULL, }, { .name = "stats", .mode = 0444, .fops = &stats_fops, .data = NULL, }, { .name = "transactions", .mode = 0444, .fops = &transactions_fops, .data = NULL, }, { .name = "transaction_log", .mode = 0444, .fops = &transaction_log_fops, .data = &binder_transaction_log, }, { .name = "failed_transaction_log", .mode = 0444, .fops = &transaction_log_fops, .data = &binder_transaction_log_failed, }, {} /* terminator */ }; static int __init 
init_binder_device(const char *name)
{
	/*
	 * Allocates a binder_device named @name, registers it as a misc
	 * device with a dynamic minor and adds it to binder_devices.
	 * Returns 0 on success, -ENOMEM on allocation failure, or the
	 * negative value returned by misc_register().
	 */
	int ret;
	struct binder_device *binder_device;

	binder_device = kzalloc(sizeof(*binder_device), GFP_KERNEL);
	if (!binder_device)
		return -ENOMEM;

	binder_device->miscdev.fops = &binder_fops;
	binder_device->miscdev.minor = MISC_DYNAMIC_MINOR;
	binder_device->miscdev.name = name;

	refcount_set(&binder_device->ref, 1);
	binder_device->context.binder_context_mgr_uid = INVALID_UID;
	binder_device->context.name = name;
	mutex_init(&binder_device->context.context_mgr_node_lock);

	ret = misc_register(&binder_device->miscdev);
	if (ret < 0) {
		kfree(binder_device);
		return ret;
	}

	hlist_add_head(&binder_device->hlist, &binder_devices);

	return ret;
}

/*
 * Driver init: sets up the allocator shrinker, the debugfs tree and, when
 * binderfs is not enabled and binder_devices_param is non-empty, one misc
 * device per comma-separated name; finally calls init_binderfs().
 * On failure everything registered so far is torn down again.
 */
static int __init binder_init(void)
{
	int ret;
	char *device_name, *device_tmp;
	struct binder_device *device;
	struct hlist_node *tmp;
	char *device_names = NULL;
	const struct binder_debugfs_entry *db_entry;

	ret = binder_alloc_shrinker_init();
	if (ret)
		return ret;

	atomic_set(&binder_transaction_log.cur, ~0U);
	atomic_set(&binder_transaction_log_failed.cur, ~0U);

	binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL);

	binder_for_each_debugfs_entry(db_entry)
		debugfs_create_file(db_entry->name,
					db_entry->mode,
					binder_debugfs_dir_entry_root,
					db_entry->data,
					db_entry->fops);

	binder_debugfs_dir_entry_proc = debugfs_create_dir("proc",
						binder_debugfs_dir_entry_root);

	if (!IS_ENABLED(CONFIG_ANDROID_BINDERFS) &&
	    strcmp(binder_devices_param, "") != 0) {
		/*
		* Copy the module_parameter string, because we don't want to
		* tokenize it in-place.
		 */
		device_names = kstrdup(binder_devices_param, GFP_KERNEL);
		if (!device_names) {
			ret = -ENOMEM;
			goto err_alloc_device_names_failed;
		}

		device_tmp = device_names;
		while ((device_name = strsep(&device_tmp, ","))) {
			ret = init_binder_device(device_name);
			if (ret)
				goto err_init_binder_device_failed;
		}
	}

	ret = init_binderfs();
	if (ret)
		goto err_init_binder_device_failed;

	/*
	 * device_names is not freed on success; the registered devices keep
	 * the name pointers handed to init_binder_device().
	 */
	return ret;

err_init_binder_device_failed:
	hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) {
		misc_deregister(&device->miscdev);
		hlist_del(&device->hlist);
		kfree(device);
	}

	kfree(device_names);

err_alloc_device_names_failed:
	debugfs_remove_recursive(binder_debugfs_dir_entry_root);
	binder_alloc_shrinker_exit();

	return ret;
}

device_initcall(binder_init);

#define CREATE_TRACE_POINTS
#include "binder_trace.h"

MODULE_LICENSE("GPL v2"); |
| 18 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Authentication token and access key management internal defs * * Copyright (C) 2003-5, 2007 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com)
 */

#ifndef _INTERNAL_H
#define _INTERNAL_H

#include <linux/sched.h>
#include <linux/wait_bit.h>
#include <linux/cred.h>
#include <linux/key-type.h>
#include <linux/task_work.h>
#include <linux/keyctl.h>
#include <linux/refcount.h>
#include <linux/watch_queue.h>
#include <linux/compat.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

struct iovec;

/*
 * Debug tracing helpers. With __KDEBUG defined they print via printk();
 * otherwise they go through no_printk().
 */
#ifdef __KDEBUG
#define kenter(FMT, ...) \
	printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
#define kleave(FMT, ...) \
	printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
#define kdebug(FMT, ...) \
	printk(KERN_DEBUG " "FMT"\n", ##__VA_ARGS__)
#else
#define kenter(FMT, ...) \
	no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
#define kleave(FMT, ...) \
	no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
#define kdebug(FMT, ...) \
	no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
#endif

extern struct key_type key_type_dead;
extern struct key_type key_type_user;
extern struct key_type key_type_logon;

/*****************************************************************************/
/*
 * Keep track of keys for a user.
 *
 * This needs to be separate to user_struct to avoid a refcount-loop
 * (user_struct pins some keyrings which pin this struct).
 *
 * We also keep track of keys under request from userspace for this UID here.
 *
 * NOTE(review): the "for accessing qnkeys & qnbytes" remark sits next to
 * 'usage' but presumably describes 'lock' - confirm against the users.
 */
struct key_user {
	struct rb_node node;
	struct mutex cons_lock; /* construction initiation lock */
	spinlock_t lock;
	refcount_t usage; /* for accessing qnkeys & qnbytes */
	atomic_t nkeys; /* number of keys */
	atomic_t nikeys; /* number of instantiated keys */
	kuid_t uid;
	int qnkeys; /* number of keys allocated to this user */
	int qnbytes; /* number of bytes allocated to this user */
};

extern struct rb_root key_user_tree;
extern spinlock_t key_user_lock;
extern struct key_user root_key_user;

extern struct key_user *key_user_lookup(kuid_t uid);
extern void key_user_put(struct key_user *user);

/*
 * Key quota limits.
 * - root has its own separate limits to everyone else
 */
extern unsigned key_quota_root_maxkeys;
extern unsigned key_quota_root_maxbytes;
extern unsigned key_quota_maxkeys;
extern unsigned key_quota_maxbytes;

#define KEYQUOTA_LINK_BYTES 4 /* a link in a keyring is worth 4 bytes */

extern struct kmem_cache *key_jar;
extern struct rb_root key_serial_tree;
extern spinlock_t key_serial_lock;
extern struct mutex key_construction_mutex;
extern wait_queue_head_t request_key_conswq;

extern void key_set_index_key(struct keyring_index_key *index_key);
extern struct key_type *key_type_lookup(const char *type);
extern void key_type_put(struct key_type *ktype);

extern int __key_link_lock(struct key *keyring,
			   const struct keyring_index_key *index_key);
extern int __key_move_lock(struct key *l_keyring, struct key *u_keyring,
			   const struct keyring_index_key *index_key);
extern int __key_link_begin(struct key *keyring,
			    const struct keyring_index_key *index_key,
			    struct assoc_array_edit **_edit);
extern int __key_link_check_live_key(struct key *keyring, struct key *key);
extern void __key_link(struct key *keyring, struct key *key,
		       struct assoc_array_edit **_edit);
extern void __key_link_end(struct key *keyring,
			   const struct keyring_index_key *index_key,
			   struct assoc_array_edit *edit);

extern key_ref_t find_key_to_update(key_ref_t keyring_ref,
				    const struct keyring_index_key *index_key);

/*
 * Parameters and working state of a keyring search. The KEYRING_SEARCH_*
 * values below are bits for the 'flags' member.
 */
struct keyring_search_context {
	struct keyring_index_key index_key;
	const struct cred *cred;
	struct key_match_data match_data;
	unsigned flags;
#define KEYRING_SEARCH_NO_STATE_CHECK 0x0001 /* Skip state checks */
#define KEYRING_SEARCH_DO_STATE_CHECK 0x0002 /* Override NO_STATE_CHECK */
#define KEYRING_SEARCH_NO_UPDATE_TIME 0x0004 /* Don't update times */
#define KEYRING_SEARCH_NO_CHECK_PERM 0x0008 /* Don't check permissions */
#define KEYRING_SEARCH_DETECT_TOO_DEEP 0x0010 /* Give an error on excessive depth */
#define KEYRING_SEARCH_SKIP_EXPIRED 0x0020 /* Ignore expired keys (intention to replace) */
#define KEYRING_SEARCH_RECURSE 0x0040 /* Search child keyrings also */

	int (*iterator)(const void *object, void *iterator_data);

	/* Internal stuff */
	int skipped_ret;
	bool possessed;
	key_ref_t result;
	time64_t now;
};

extern bool key_default_cmp(const struct key *key,
			    const struct key_match_data *match_data);
extern key_ref_t keyring_search_rcu(key_ref_t keyring_ref,
				    struct keyring_search_context *ctx);

extern key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx);
extern key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx);

extern struct key *find_keyring_by_name(const char *name, bool uid_keyring);

extern int look_up_user_keyrings(struct key **, struct key **);
extern struct key *get_user_session_keyring_rcu(const struct cred *);
extern int install_thread_keyring_to_cred(struct cred *);
extern int install_process_keyring_to_cred(struct cred *);
extern int install_session_keyring_to_cred(struct cred *, struct key *);

extern struct key *request_key_and_link(struct key_type *type,
					const char *description,
					struct key_tag *domain_tag,
					const void *callout_info,
					size_t callout_len,
					void *aux,
					struct key *dest_keyring,
					unsigned long flags);

extern bool lookup_user_key_possessed(const struct key *key,
				      const struct key_match_data *match_data);

extern long join_session_keyring(const char *name);
extern void key_change_session_keyring(struct callback_head *twork);

extern struct work_struct key_gc_work;
extern unsigned key_gc_delay;
extern void keyring_gc(struct key *keyring, time64_t limit);
extern void keyring_restriction_gc(struct key *keyring,
				   struct key_type *dead_type);
void key_set_expiry(struct key *key, time64_t expiry);
extern void key_schedule_gc(time64_t gc_at);
extern void key_schedule_gc_links(void);
extern void key_gc_keytype(struct key_type *ktype);

extern int key_task_permission(const key_ref_t key_ref,
			       const struct cred *cred,
			       enum key_need_perm need_perm);

/*
 * Post a key notification of @subtype, carrying @aux, to the key's watchers.
 * Compiles to an empty function unless CONFIG_KEY_NOTIFICATIONS is set.
 */
static inline void notify_key(struct key *key,
			      enum key_notification_subtype subtype, u32 aux)
{
#ifdef CONFIG_KEY_NOTIFICATIONS
	struct key_notification n = {
		.watch.type = WATCH_TYPE_KEY_NOTIFY,
		.watch.subtype = subtype,
		.watch.info = watch_sizeof(n),
		.key_id = key_serial(key),
		.aux = aux,
	};

	post_watch_notification(key->watchers, &n.watch, current_cred(),
				n.key_id);
#endif
}

/*
 * Check to see whether permission is granted to use a key in the desired way.
 * Wrapper around key_task_permission() using the current task's credentials.
 */
static inline int key_permission(const key_ref_t key_ref,
				 enum key_need_perm need_perm)
{
	return key_task_permission(key_ref, current_cred(), need_perm);
}

extern struct key_type key_type_request_key_auth;
extern struct key *request_key_auth_new(struct key *target,
					const char *op,
					const void *callout_info,
					size_t callout_len,
					struct key *dest_keyring);

extern struct key *key_get_instantiation_authkey(key_serial_t target_id);

/*
 * Determine whether a key is dead.
 *
 * A key with a finite expiry (not TIME64_MAX) is dead once that expiry,
 * extended by key_gc_delay unless the key type has KEY_TYPE_INSTANT_REAP,
 * is at or before @limit. Otherwise it is dead when the DEAD or INVALIDATED
 * flag bit is set or its domain tag has been removed.
 */
static inline bool key_is_dead(const struct key *key, time64_t limit)
{
	time64_t expiry = key->expiry;

	if (expiry != TIME64_MAX) {
		if (!(key->type->flags & KEY_TYPE_INSTANT_REAP))
			expiry += key_gc_delay;
		if (expiry <= limit)
			return true;
	}

	return
		key->flags & ((1 << KEY_FLAG_DEAD) |
			      (1 << KEY_FLAG_INVALIDATED)) ||
		key->domain_tag->removed;
}

/*
 * keyctl() functions
 */
extern long keyctl_get_keyring_ID(key_serial_t, int);
extern long keyctl_join_session_keyring(const char __user *);
extern long keyctl_update_key(key_serial_t, const void __user *, size_t);
extern long keyctl_revoke_key(key_serial_t);
extern long keyctl_keyring_clear(key_serial_t);
extern long keyctl_keyring_link(key_serial_t, key_serial_t);
extern long keyctl_keyring_move(key_serial_t, key_serial_t, key_serial_t, unsigned int);
extern long keyctl_keyring_unlink(key_serial_t, key_serial_t);
extern long keyctl_describe_key(key_serial_t, char __user *, size_t);
extern long keyctl_keyring_search(key_serial_t, const char __user *,
				  const char __user *, key_serial_t);
extern long keyctl_read_key(key_serial_t, char __user *, size_t);
extern
long keyctl_chown_key(key_serial_t, uid_t, gid_t); extern long keyctl_setperm_key(key_serial_t, key_perm_t); extern long keyctl_instantiate_key(key_serial_t, const void __user *, size_t, key_serial_t); extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t); extern long keyctl_set_reqkey_keyring(int); extern long keyctl_set_timeout(key_serial_t, unsigned); extern long keyctl_assume_authority(key_serial_t); extern long keyctl_get_security(key_serial_t keyid, char __user *buffer, size_t buflen); extern long keyctl_session_to_parent(void); extern long keyctl_reject_key(key_serial_t, unsigned, unsigned, key_serial_t); extern long keyctl_instantiate_key_iov(key_serial_t, const struct iovec __user *, unsigned, key_serial_t); extern long keyctl_invalidate_key(key_serial_t); extern long keyctl_restrict_keyring(key_serial_t id, const char __user *_type, const char __user *_restriction); #ifdef CONFIG_PERSISTENT_KEYRINGS extern long keyctl_get_persistent(uid_t, key_serial_t); extern unsigned persistent_keyring_expiry; #else static inline long keyctl_get_persistent(uid_t uid, key_serial_t destring) { return -EOPNOTSUPP; } #endif #ifdef CONFIG_KEY_DH_OPERATIONS extern long keyctl_dh_compute(struct keyctl_dh_params __user *, char __user *, size_t, struct keyctl_kdf_params __user *); extern long __keyctl_dh_compute(struct keyctl_dh_params __user *, char __user *, size_t, struct keyctl_kdf_params *); #ifdef CONFIG_COMPAT extern long compat_keyctl_dh_compute(struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct compat_keyctl_kdf_params __user *kdf); #endif #define KEYCTL_KDF_MAX_OUTPUT_LEN 1024 /* max length of KDF output */ #define KEYCTL_KDF_MAX_OI_LEN 64 /* max length of otherinfo */ #else static inline long keyctl_dh_compute(struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct keyctl_kdf_params __user *kdf) { return -EOPNOTSUPP; } #ifdef CONFIG_COMPAT static inline long compat_keyctl_dh_compute( struct 
keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct keyctl_kdf_params __user *kdf) { return -EOPNOTSUPP; } #endif #endif #ifdef CONFIG_ASYMMETRIC_KEY_TYPE extern long keyctl_pkey_query(key_serial_t, const char __user *, struct keyctl_pkey_query __user *); extern long keyctl_pkey_verify(const struct keyctl_pkey_params __user *, const char __user *, const void __user *, const void __user *); extern long keyctl_pkey_e_d_s(int, const struct keyctl_pkey_params __user *, const char __user *, const void __user *, void __user *); #else static inline long keyctl_pkey_query(key_serial_t id, const char __user *_info, struct keyctl_pkey_query __user *_res) { return -EOPNOTSUPP; } static inline long keyctl_pkey_verify(const struct keyctl_pkey_params __user *params, const char __user *_info, const void __user *_in, const void __user *_in2) { return -EOPNOTSUPP; } static inline long keyctl_pkey_e_d_s(int op, const struct keyctl_pkey_params __user *params, const char __user *_info, const void __user *_in, void __user *_out) { return -EOPNOTSUPP; } #endif extern long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen); #ifdef CONFIG_KEY_NOTIFICATIONS extern long keyctl_watch_key(key_serial_t, int, int); #else static inline long keyctl_watch_key(key_serial_t key_id, int watch_fd, int watch_id) { return -EOPNOTSUPP; } #endif /* * Debugging key validation */ #ifdef KEY_DEBUGGING extern void __key_check(const struct key *); static inline void key_check(const struct key *key) { if (key && (IS_ERR(key) || key->magic != KEY_DEBUG_MAGIC)) __key_check(key); } #else #define key_check(key) do {} while(0) #endif #endif /* _INTERNAL_H */ |
| 177 24 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 | // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.h * * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> * Modified by * Allison Henderson <achender@linux.vnet.ibm.com> * Zheng Liu <wenqing.lz@taobao.com> * */ #ifndef _EXT4_EXTENTS_STATUS_H #define _EXT4_EXTENTS_STATUS_H /* * Turn on ES_DEBUG__ to get lots of info about extent status operations. */ #ifdef ES_DEBUG__ #define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) #else #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* * With ES_AGGRESSIVE_TEST defined, the result of es caching will be * checked with old map_block's result. 
*/ #define ES_AGGRESSIVE_TEST__ /* * These flags live in the high bits of extent_status.es_pblk */ enum { ES_WRITTEN_B, ES_UNWRITTEN_B, ES_DELAYED_B, ES_HOLE_B, ES_REFERENCED_B, ES_FLAGS }; #define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) #define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) #define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) #define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) #define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) #define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) << ES_SHIFT) struct ext4_sb_info; struct ext4_extent; struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ ext4_lblk_t es_len; /* length of extent in block */ ext4_fsblk_t es_pblk; /* first physical block */ }; struct ext4_es_tree { struct rb_root root; struct extent_status *cache_es; /* recently accessed extent */ }; struct ext4_es_stats { unsigned long es_stats_shrunk; struct percpu_counter es_stats_cache_hits; struct percpu_counter es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; struct percpu_counter es_stats_shk_cnt; }; /* * Pending cluster reservations for bigalloc file systems * * A cluster with a pending reservation is a logical cluster shared by at * least one extent in the extents status tree with delayed and unwritten * status and at least one other written or unwritten extent. The * reservation is said to be pending because a cluster reservation would * have to be taken in the event all blocks in the cluster shared with * written or unwritten extents were deleted while the delayed and * unwritten blocks remained. 
* * The set of pending cluster reservations is an auxiliary data structure * used with the extents status tree to implement reserved cluster/block * accounting for bigalloc file systems. The set is kept in memory and * records all pending cluster reservations. * * Its primary function is to avoid the need to read extents from the * disk when invalidating pages as a result of a truncate, punch hole, or * collapse range operation. Page invalidation requires a decrease in the * reserved cluster count if it results in the removal of all delayed * and unwritten extents (blocks) from a cluster that is not shared with a * written or unwritten extent, and no decrease otherwise. Determining * whether the cluster is shared can be done by searching for a pending * reservation on it. * * Secondarily, it provides a potentially faster method for determining * whether the reserved cluster count should be increased when a physical * cluster is deallocated as a result of a truncate, punch hole, or * collapse range operation. The necessary information is also present * in the extents status tree, but might be more rapidly accessed in * the pending reservation set in many cases due to smaller size. * * The pending cluster reservation set is implemented as a red-black tree * with the goal of minimizing per page search time overhead. 
*/ struct pending_reservation { struct rb_node rb_node; ext4_lblk_t lclu; }; struct ext4_pending_tree { struct rb_root root; }; extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_find_extent_range(struct inode *inode, int (*match_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t *next_lblk, struct extent_status *es); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end); extern bool ext4_es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk); static inline unsigned int ext4_es_status(struct extent_status *es) { return es->es_pblk >> ES_SHIFT; } static inline unsigned int ext4_es_type(struct extent_status *es) { return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; } static inline int ext4_es_is_written(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } static inline int ext4_es_is_mapped(struct extent_status *es) { return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); } static inline 
int ext4_es_is_delonly(struct extent_status *es) { return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); } static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; } static inline void ext4_es_clear_referenced(struct extent_status *es) { es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); } static inline int ext4_es_is_referenced(struct extent_status *es) { return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) { return es->es_pblk & ~ES_MASK; } static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) { ext4_fsblk_t pblock = ext4_es_pblock(es); return pblock == ~ES_MASK ? 0 : pblock; } static inline void ext4_es_store_pblock(struct extent_status *es, ext4_fsblk_t pb) { ext4_fsblk_t block; block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); es->es_pblk = block; } static inline void ext4_es_store_status(struct extent_status *es, unsigned int status) { es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (es->es_pblk & ~ES_MASK); } static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (pb & ~ES_MASK); } extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); extern int __init ext4_init_pending(void); extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, bool lclu_allocated, bool end_allocated); extern unsigned 
int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ |
| 33 34 33 33 34 33 33 5 5 34 2 32 34 34 33 34 33 34 34 34 34 34 33 34 32 34 34 33 33 33 33 34 34 34 32 34 30 34 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 
498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 
998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 
1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 
1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 
2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 
2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 
2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 
3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1999 Eric Youngdale * Copyright (C) 2014 Christoph Hellwig * * SCSI queueing library. * Initial versions: Eric Youngdale (eric@andante.org). * Based upon conversations with large numbers * of people at Linux Expo. */ #include <linux/bio.h> #include <linux/bitops.h> #include <linux/blkdev.h> #include <linux/completion.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/init.h> #include <linux/pci.h> #include <linux/delay.h> #include <linux/hardirq.h> #include <linux/scatterlist.h> #include <linux/blk-mq.h> #include <linux/blk-integrity.h> #include <linux/ratelimit.h> #include <asm/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_transport.h> /* scsi_init_limits() */ #include <scsi/scsi_dh.h> #include <trace/events/scsi.h> #include "scsi_debugfs.h" #include "scsi_priv.h" #include "scsi_logging.h" /* * Size of integrity metadata is usually small, 1 inline sg should * cover normal cases. 
*/
#ifdef CONFIG_ARCH_NO_SG_CHAIN
#define  SCSI_INLINE_PROT_SG_CNT  0
#define  SCSI_INLINE_SG_CNT  0
#else
#define  SCSI_INLINE_PROT_SG_CNT  1
#define  SCSI_INLINE_SG_CNT  2
#endif

/* Slab cache of SCSI_SENSE_BUFFERSIZE-byte objects; see scsi_init_sense_cache(). */
static struct kmem_cache *scsi_sense_cache;
/* Serializes the one-time creation of scsi_sense_cache. */
static DEFINE_MUTEX(scsi_sense_cache_mutex);

static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd);

/**
 * scsi_init_sense_cache - create the sense buffer slab cache if needed
 * @shost: SCSI host (not referenced by this function)
 *
 * Creates the "scsi_sense_cache" slab cache the first time it is called;
 * later calls find the pointer already set and do nothing.
 *
 * Returns 0 on success or if the cache already exists, -ENOMEM if the
 * cache could not be created.
 */
int scsi_init_sense_cache(struct Scsi_Host *shost)
{
	int ret = 0;

	mutex_lock(&scsi_sense_cache_mutex);
	if (!scsi_sense_cache) {
		scsi_sense_cache =
			kmem_cache_create_usercopy("scsi_sense_cache",
				SCSI_SENSE_BUFFERSIZE, 0, SLAB_HWCACHE_ALIGN,
				0, SCSI_SENSE_BUFFERSIZE, NULL);
		if (!scsi_sense_cache)
			ret = -ENOMEM;
	}
	mutex_unlock(&scsi_sense_cache_mutex);
	return ret;
}

/*
 * Arm the blocked counter that corresponds to @reason: the host's for
 * SCSI_MLQUEUE_HOST_BUSY, the device's for SCSI_MLQUEUE_DEVICE_BUSY and
 * SCSI_MLQUEUE_EH_RETRY, the target's for SCSI_MLQUEUE_TARGET_BUSY. Any
 * other @reason leaves all three counters untouched.
 */
static void
scsi_set_blocked(struct scsi_cmnd *cmd, int reason)
{
	struct Scsi_Host *host = cmd->device->host;
	struct scsi_device *device = cmd->device;
	struct scsi_target *starget = scsi_target(device);

	/*
	 * Set the appropriate busy bit for the device/host.
	 *
	 * If the host/device isn't busy, assume that something actually
	 * completed, and that we should be able to queue a command now.
	 *
	 * Note that the prior mid-layer assumption that any host could
	 * always queue at least one command is now broken.  The mid-layer
	 * will implement a user specifiable stall (see
	 * scsi_host.max_host_blocked and scsi_device.max_device_blocked)
	 * if a command is requeued with no other commands outstanding
	 * either for the device or for the host.
	 */
	switch (reason) {
	case SCSI_MLQUEUE_HOST_BUSY:
		atomic_set(&host->host_blocked, host->max_host_blocked);
		break;
	case SCSI_MLQUEUE_DEVICE_BUSY:
	case SCSI_MLQUEUE_EH_RETRY:
		atomic_set(&device->device_blocked,
			   device->max_device_blocked);
		break;
	case SCSI_MLQUEUE_TARGET_BUSY:
		atomic_set(&starget->target_blocked,
			   starget->max_target_blocked);
		break;
	}
}

/*
 * Put @cmd's request back on the block layer requeue list.
 *
 * A request flagged RQF_DONTPREP has the flag cleared and is passed to
 * scsi_mq_uninit_cmd(); a request without the flag triggers a one-time
 * warning. Unless the host is in recovery, the requeue list is kicked
 * with a delay of @msecs.
 */
static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd, unsigned long msecs)
{
	struct request *rq = scsi_cmd_to_rq(cmd);

	if (rq->rq_flags & RQF_DONTPREP) {
		rq->rq_flags &= ~RQF_DONTPREP;
		scsi_mq_uninit_cmd(cmd);
	} else {
		WARN_ON_ONCE(true);
	}

	/* Requeue without kicking; the (possibly delayed) kick follows. */
	blk_mq_requeue_request(rq, false);
	if (!scsi_host_in_recovery(cmd->device->host))
		blk_mq_delay_kick_requeue_list(rq->q, msecs);
}

/**
 * __scsi_queue_insert - private queue insertion
 * @cmd: The SCSI command being requeued
 * @reason:  The reason for the requeue
 * @unbusy: Whether the queue should be unbusied
 *
 * This is a private queue insertion.  The public interface
 * scsi_queue_insert() always assumes the queue should be unbusied
 * because it's always called before the completion.  This function is
 * for a requeue after completion, which should only occur in this
 * file.
 */
static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy)
{
	struct scsi_device *device = cmd->device;

	SCSI_LOG_MLQUEUE(1, scmd_printk(KERN_INFO, cmd,
		"Inserting command %p into mlqueue\n", cmd));

	scsi_set_blocked(cmd, reason);

	/*
	 * Decrement the counters, since these commands are no longer
	 * active on the host/device.
	 */
	if (unbusy)
		scsi_device_unbusy(device, cmd);

	/*
	 * Requeue this command.  It will go before all other commands
	 * that are already in the queue. Schedule requeue work under
	 * lock such that the kblockd_schedule_work() call happens
	 * before blk_mq_destroy_queue() finishes.
	 */
	cmd->result = 0;

	/* The requeue list is kicked only when the host is not in recovery. */
	blk_mq_requeue_request(scsi_cmd_to_rq(cmd),
			       !scsi_host_in_recovery(cmd->device->host));
}

/**
 * scsi_queue_insert - Reinsert a command in the queue.
* @cmd:    command that we are adding to queue.
 * @reason: why we are inserting command to queue.
 *
 * We do this for one of two cases. Either the host is busy and it cannot accept
 * any more commands for the time being, or the device returned QUEUE_FULL and
 * can accept no more commands.
 *
 * Context: This could be called either from an interrupt context or a normal
 * process context.
 */
void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
{
	__scsi_queue_insert(cmd, reason, true);
}

/**
 * scsi_failures_reset_retries - zero all retry counters in @failures
 * @failures: failure list whose counters are reset
 *
 * Clears total_retries and the retries count of every entry in
 * failure_definitions, walking the array until an entry whose result is 0.
 */
void scsi_failures_reset_retries(struct scsi_failures *failures)
{
	struct scsi_failure *failure;

	failures->total_retries = 0;

	for (failure = failures->failure_definitions; failure->result;
	     failure++)
		failure->retries = 0;
}
EXPORT_SYMBOL_GPL(scsi_failures_reset_retries);

/**
 * scsi_check_passthrough - Determine if passthrough scsi_cmnd needs a retry.
 * @scmd: scsi_cmnd to check.
 * @failures: scsi_failures struct that lists failures to check for.
 *
 * Each entry of @failures->failure_definitions (terminated by an entry with
 * result == 0) is compared against @scmd->result, first by host byte, then
 * by status byte and finally, for CHECK CONDITION, by sense key/asc/ascq.
 * A NULL @failures never requests a retry.
 *
 * Returns -EAGAIN if the caller should retry else 0.
 */
static int scsi_check_passthrough(struct scsi_cmnd *scmd,
				  struct scsi_failures *failures)
{
	struct scsi_failure *failure;
	struct scsi_sense_hdr sshdr;
	enum sam_status status;

	if (!failures)
		return 0;

	for (failure = failures->failure_definitions; failure->result;
	     failure++) {
		if (failure->result == SCMD_FAILURE_RESULT_ANY)
			goto maybe_retry;

		if (host_byte(scmd->result) &&
		    host_byte(scmd->result) == host_byte(failure->result))
			goto maybe_retry;

		status = status_byte(scmd->result);
		if (!status)
			continue;

		if (failure->result == SCMD_FAILURE_STAT_ANY &&
		    !scsi_status_is_good(scmd->result))
			goto maybe_retry;

		if (status != status_byte(failure->result))
			continue;

		if (status_byte(failure->result) != SAM_STAT_CHECK_CONDITION ||
		    failure->sense == SCMD_FAILURE_SENSE_ANY)
			goto maybe_retry;

		/* Sense data is needed from here on; give up if it is unusable. */
		if (!scsi_command_normalize_sense(scmd, &sshdr))
			return 0;

		if (failure->sense != sshdr.sense_key)
			continue;

		if (failure->asc == SCMD_FAILURE_ASC_ANY)
			goto maybe_retry;

		if (failure->asc != sshdr.asc)
			continue;

		if (failure->ascq == SCMD_FAILURE_ASCQ_ANY ||
		    failure->ascq == sshdr.ascq)
			goto maybe_retry;
	}

	return 0;

maybe_retry:
	/*
	 * A non-zero per-failure limit is checked on its own; only entries
	 * with allowed == 0 are counted against the list-wide total_allowed.
	 */
	if (failure->allowed) {
		if (failure->allowed == SCMD_FAILURE_NO_LIMIT ||
		    ++failure->retries <= failure->allowed)
			return -EAGAIN;
	} else {
		if (failures->total_allowed == SCMD_FAILURE_NO_LIMIT ||
		    ++failures->total_retries <= failures->total_allowed)
			return -EAGAIN;
	}

	return 0;
}

/**
 * scsi_execute_cmd - insert request and wait for the result
 * @sdev:	scsi_device
 * @cmd:	scsi command
 * @opf:	block layer request cmd_flags
 * @buffer:	data buffer
 * @bufflen:	len of buffer
 * @timeout:	request timeout in HZ
 * @ml_retries:	number of times SCSI midlayer will retry request
 * @args:	Optional args. See struct definition for field descriptions
 *
 * If @args is NULL a zero-initialized default is used. If @args->sense is
 * set, @args->sense_len must equal SCSI_SENSE_BUFFERSIZE or -EINVAL is
 * returned (with a one-time warning).
 *
 * Returns the scsi_cmnd result field if a command was executed, or a negative
 * Linux error code if we didn't get that far.
 */
int scsi_execute_cmd(struct scsi_device *sdev, const unsigned char *cmd,
		     blk_opf_t opf, void *buffer, unsigned int bufflen,
		     int timeout, int ml_retries,
		     const struct scsi_exec_args *args)
{
	static const struct scsi_exec_args default_args;
	struct request *req;
	struct scsi_cmnd *scmd;
	int ret;

	if (!args)
		args = &default_args;
	else if (WARN_ON_ONCE(args->sense &&
			      args->sense_len != SCSI_SENSE_BUFFERSIZE))
		return -EINVAL;

	/* Every attempt, including retries, allocates a fresh request. */
retry:
	req = scsi_alloc_request(sdev->request_queue, opf, args->req_flags);
	if (IS_ERR(req))
		return PTR_ERR(req);

	if (bufflen) {
		ret = blk_rq_map_kern(sdev->request_queue, req,
				      buffer, bufflen, GFP_NOIO);
		if (ret)
			goto out;
	}
	scmd = blk_mq_rq_to_pdu(req);
	scmd->cmd_len = COMMAND_SIZE(cmd[0]);
	memcpy(scmd->cmnd, cmd, scmd->cmd_len);
	scmd->allowed = ml_retries;
	scmd->flags |= args->scmd_flags;
	req->timeout = timeout;
	req->rq_flags |= RQF_QUIET;

	/*
	 * head injection *required* here otherwise quiesce won't work
	 */
	blk_execute_rq(req, true);

	if (scsi_check_passthrough(scmd, args->failures) == -EAGAIN) {
		blk_mq_free_request(req);
		goto retry;
	}

	/*
	 * Some devices (USB mass-storage in particular) may transfer
	 * garbage data together with a residue indicating that the data
	 * is invalid.  Prevent the garbage from being misinterpreted
	 * and prevent security leaks by zeroing out the excess data.
	 */
	if (unlikely(scmd->resid_len > 0 && scmd->resid_len <= bufflen))
		memset(buffer + bufflen - scmd->resid_len, 0, scmd->resid_len);

	/* Copy the optional outputs the caller asked for. */
	if (args->resid)
		*args->resid = scmd->resid_len;
	if (args->sense)
		memcpy(args->sense, scmd->sense_buffer, SCSI_SENSE_BUFFERSIZE);
	if (args->sshdr)
		scsi_normalize_sense(scmd->sense_buffer, scmd->sense_len,
				     args->sshdr);

	ret = scmd->result;
 out:
	blk_mq_free_request(req);

	return ret;
}
EXPORT_SYMBOL(scsi_execute_cmd);

/*
 * Wake up the error handler if necessary. Avoid as follows that the error
 * handler is not woken up if host in-flight requests number ==
 * shost->host_failed: use call_rcu() in scsi_eh_scmd_add() in combination
 * with an RCU read lock in this function to ensure that this function in
 * its entirety either finishes before scsi_eh_scmd_add() increases the
 * host_failed counter or that it notices the shost state change made by
 * scsi_eh_scmd_add().
 */
static void scsi_dec_host_busy(struct Scsi_Host *shost, struct scsi_cmnd *cmd)
{
	unsigned long flags;

	rcu_read_lock();
	__clear_bit(SCMD_STATE_INFLIGHT, &cmd->state);
	if (unlikely(scsi_host_in_recovery(shost))) {
		/* Sampled before host_lock is taken. */
		unsigned int busy = scsi_host_busy(shost);

		spin_lock_irqsave(shost->host_lock, flags);
		if (shost->host_failed || shost->host_eh_scheduled)
			scsi_eh_wakeup(shost, busy);
		spin_unlock_irqrestore(shost->host_lock, flags);
	}
	rcu_read_unlock();
}

/*
 * Release what @cmd held while in flight: clear its in-flight state via
 * scsi_dec_host_busy(), decrement the target busy count when the target
 * has a queue limit (can_queue > 0), and return the command's budget
 * token to @sdev's budget map.
 */
void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd)
{
	struct Scsi_Host *shost = sdev->host;
	struct scsi_target *starget = scsi_target(sdev);

	scsi_dec_host_busy(shost, cmd);

	if (starget->can_queue > 0)
		atomic_dec(&starget->target_busy);

	sbitmap_put(&sdev->budget_map, cmd->budget_token);
	/* Mark the command as no longer owning a budget token. */
	cmd->budget_token = -1;
}

/*
 * Kick the queue of SCSI device @sdev if @sdev != current_sdev. Called with
 * interrupts disabled.
*/
static void scsi_kick_sdev_queue(struct scsi_device *sdev, void *data)
{
	/* @data carries the device whose queue must not be kicked. */
	struct scsi_device *current_sdev = data;

	if (sdev != current_sdev)
		blk_mq_run_hw_queues(sdev->request_queue, true);
}

/*
 * Called for single_lun devices on IO completion. Clear starget_sdev_user,
 * and call blk_run_queue for all the scsi_devices on the target -
 * including current_sdev first.
 *
 * Called with *no* scsi locks held.
 */
static void scsi_single_lun_run(struct scsi_device *current_sdev)
{
	struct Scsi_Host *shost = current_sdev->host;
	struct scsi_target *starget = scsi_target(current_sdev);
	unsigned long flags;

	spin_lock_irqsave(shost->host_lock, flags);
	starget->starget_sdev_user = NULL;
	spin_unlock_irqrestore(shost->host_lock, flags);

	/*
	 * Call blk_run_queue for all LUNs on the target, starting with
	 * current_sdev. We race with others (to set starget_sdev_user),
	 * but in most cases, we will be first. Ideally, each LU on the
	 * target would get some limited time or requests on the target.
	 */
	blk_mq_run_hw_queues(current_sdev->request_queue,
			     shost->queuecommand_may_block);

	spin_lock_irqsave(shost->host_lock, flags);
	/* Kick the sibling queues only if nobody has claimed the target. */
	if (!starget->starget_sdev_user)
		__starget_for_each_device(starget, current_sdev,
					  scsi_kick_sdev_queue);
	spin_unlock_irqrestore(shost->host_lock, flags);
}

/*
 * True when the device's busy count has reached queue_depth or its
 * device_blocked counter is positive.
 */
static inline bool scsi_device_is_busy(struct scsi_device *sdev)
{
	if (scsi_device_busy(sdev) >= sdev->queue_depth)
		return true;
	if (atomic_read(&sdev->device_blocked) > 0)
		return true;
	return false;
}

/*
 * True when the target has a queue limit (can_queue > 0) and either
 * target_busy has reached it or target_blocked is positive. A target
 * with can_queue <= 0 is never reported busy.
 */
static inline bool scsi_target_is_busy(struct scsi_target *starget)
{
	if (starget->can_queue > 0) {
		if (atomic_read(&starget->target_busy) >= starget->can_queue)
			return true;
		if (atomic_read(&starget->target_blocked) > 0)
			return true;
	}
	return false;
}

/* True when host_blocked is positive or host_self_blocked is set. */
static inline bool scsi_host_is_busy(struct Scsi_Host *shost)
{
	if (atomic_read(&shost->host_blocked) > 0)
		return true;
	if (shost->host_self_blocked)
		return true;
	return false;
}

/*
 * Run the queues of the devices on @shost's starved list.
 *
 * The list is spliced onto a local list under host_lock and processed one
 * device at a time; the lock is dropped while each queue is run. The loop
 * stops early if the host is busy, and whatever is left on the local list
 * is spliced back onto shost->starved_list.
 */
static void scsi_starved_list_run(struct Scsi_Host *shost)
{
	LIST_HEAD(starved_list);
	struct scsi_device *sdev;
	unsigned long flags;

	spin_lock_irqsave(shost->host_lock, flags);
	list_splice_init(&shost->starved_list, &starved_list);

	while (!list_empty(&starved_list)) {
		struct request_queue *slq;

		/*
		 * As long as shost is accepting commands and we have
		 * starved queues, call blk_run_queue. scsi_request_fn
		 * drops the queue_lock and can add us back to the
		 * starved_list.
		 *
		 * host_lock protects the starved_list and starved_entry.
		 * scsi_request_fn must get the host_lock before checking
		 * or modifying starved_list or starved_entry.
		 */
		if (scsi_host_is_busy(shost))
			break;

		sdev = list_entry(starved_list.next,
				  struct scsi_device, starved_entry);
		list_del_init(&sdev->starved_entry);
		/* A device on a busy target goes back to the host's list. */
		if (scsi_target_is_busy(scsi_target(sdev))) {
			list_move_tail(&sdev->starved_entry,
				       &shost->starved_list);
			continue;
		}

		/*
		 * Once we drop the host lock, a racing scsi_remove_device()
		 * call may remove the sdev from the starved list and destroy
		 * it and the queue.  Mitigate by taking a reference to the
		 * queue and never touching the sdev again after we drop the
		 * host lock.  Note: if __scsi_remove_device() invokes
		 * blk_mq_destroy_queue() before the queue is run from this
		 * function then blk_run_queue() will return immediately since
		 * blk_mq_destroy_queue() marks the queue with QUEUE_FLAG_DYING.
		 */
		slq = sdev->request_queue;
		if (!blk_get_queue(slq))
			continue;
		spin_unlock_irqrestore(shost->host_lock, flags);

		blk_mq_run_hw_queues(slq, false);
		blk_put_queue(slq);

		spin_lock_irqsave(shost->host_lock, flags);
	}
	/* put any unprocessed entries back */
	list_splice(&starved_list, &shost->starved_list);
	spin_unlock_irqrestore(shost->host_lock, flags);
}

/**
 * scsi_run_queue - Select a proper request queue to serve next.
 * @q:  last request's queue
 *
 * The previous command was completely finished, start a new one if possible.
*/
static void scsi_run_queue(struct request_queue *q)
{
	struct scsi_device *sdev = q->queuedata;

	if (scsi_target(sdev)->single_lun)
		scsi_single_lun_run(sdev);
	if (!list_empty(&sdev->host->starved_list))
		scsi_starved_list_run(sdev->host);

	/* Note: blk_mq_kick_requeue_list() runs the queue asynchronously. */
	blk_mq_kick_requeue_list(q);
}

/*
 * Work handler for scsi_device.requeue_work: recovers the device from
 * @work and runs scsi_run_queue() on its request queue.
 */
void scsi_requeue_run_queue(struct work_struct *work)
{
	struct scsi_device *sdev;
	struct request_queue *q;

	sdev = container_of(work, struct scsi_device, requeue_work);
	q = sdev->request_queue;
	scsi_run_queue(q);
}

/* Call scsi_run_queue() for the request queue of every device on @shost. */
void scsi_run_host_queues(struct Scsi_Host *shost)
{
	struct scsi_device *sdev;

	shost_for_each_device(sdev, shost)
		scsi_run_queue(sdev->request_queue);
}

/*
 * For non-passthrough requests, invoke the owning driver's optional
 * uninit_command() callback. Passthrough requests are left alone.
 */
static void scsi_uninit_cmd(struct scsi_cmnd *cmd)
{
	if (!blk_rq_is_passthrough(scsi_cmd_to_rq(cmd))) {
		struct scsi_driver *drv = scsi_cmd_to_driver(cmd);

		if (drv->uninit_command)
			drv->uninit_command(cmd);
	}
}

/*
 * Free the chained scatterlist tables of @cmd: the data table when it has
 * entries, and the protection table when scsi_prot_sg_count() is non-zero.
 */
void scsi_free_sgtables(struct scsi_cmnd *cmd)
{
	if (cmd->sdb.table.nents)
		sg_free_table_chained(&cmd->sdb.table,
				SCSI_INLINE_SG_CNT);
	if (scsi_prot_sg_count(cmd))
		sg_free_table_chained(&cmd->prot_sdb->table,
				SCSI_INLINE_PROT_SG_CNT);
}
EXPORT_SYMBOL_GPL(scsi_free_sgtables);

/* Free @cmd's scatterlist tables, then run the driver's uninit hook. */
static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd)
{
	scsi_free_sgtables(cmd);
	scsi_uninit_cmd(cmd);
}

/*
 * Re-run queues after a completion without running them synchronously.
 *
 * Does nothing while the host is in recovery. For single_lun targets or a
 * non-empty starved list the work is handed to sdev->requeue_work;
 * otherwise the hardware queues are run asynchronously, but only if this
 * caller is the one that resets a non-zero sdev->restarts to 0.
 */
static void scsi_run_queue_async(struct scsi_device *sdev)
{
	if (scsi_host_in_recovery(sdev->host))
		return;

	if (scsi_target(sdev)->single_lun ||
	    !list_empty(&sdev->host->starved_list)) {
		kblockd_schedule_work(&sdev->requeue_work);
	} else {
		/*
		 * smp_mb() present in sbitmap_queue_clear() or implied in
		 * .end_io is for ordering writing .device_busy in
		 * scsi_device_unbusy() and reading sdev->restarts.
		 */
		int old = atomic_read(&sdev->restarts);

		/*
		 * ->restarts has to be kept as non-zero if new budget
		 *  contention occurs.
		 *
		 *  No need to run queue when either another re-run
		 *  queue wins in updating ->restarts or a new budget
		 *  contention occurs.
		 */
		if (old && atomic_cmpxchg(&sdev->restarts, old, 0) == old)
			blk_mq_run_hw_queues(sdev->request_queue, true);
	}
}

/* Returns false when no more bytes to process, true if there are more */
static bool scsi_end_request(struct request *req, blk_status_t error,
		unsigned int bytes)
{
	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
	struct scsi_device *sdev = cmd->device;
	struct request_queue *q = sdev->request_queue;

	if (blk_update_request(req, error, bytes))
		return true;

	if (q->limits.features & BLK_FEAT_ADD_RANDOM)
		add_disk_randomness(req->q->disk);

	WARN_ON_ONCE(!blk_rq_is_passthrough(req) &&
		     !(cmd->flags & SCMD_INITIALIZED));
	cmd->flags = 0;

	/*
	 * Calling rcu_barrier() is not necessary here because the
	 * SCSI error handler guarantees that the function called by
	 * call_rcu() has been called before scsi_end_request() is
	 * called.
	 */
	destroy_rcu_head(&cmd->rcu);

	/*
	 * In the MQ case the command gets freed by __blk_mq_end_request,
	 * so we have to do all cleanup that depends on it earlier.
	 *
	 * We also can't kick the queues from irq context, so we
	 * will have to defer it to a workqueue.
	 */
	scsi_mq_uninit_cmd(cmd);

	/*
	 * queue is still alive, so grab the ref for preventing it
	 * from being cleaned up during running queue.
	 */
	percpu_ref_get(&q->q_usage_counter);

	__blk_mq_end_request(req, error);

	scsi_run_queue_async(sdev);

	percpu_ref_put(&q->q_usage_counter);
	return false;
}

/**
 * scsi_result_to_blk_status - translate a SCSI result code into blk_status_t
 * @result:	scsi error code
 *
 * Translate a SCSI result code into a blk_status_t value.
 */
static blk_status_t scsi_result_to_blk_status(int result)
{
	/*
	 * Check the scsi-ml byte first in case we converted a host or status
	 * byte.
	 */
	switch (scsi_ml_byte(result)) {
	case SCSIML_STAT_OK:
		break;
	case SCSIML_STAT_RESV_CONFLICT:
		return BLK_STS_RESV_CONFLICT;
	case SCSIML_STAT_NOSPC:
		return BLK_STS_NOSPC;
	case SCSIML_STAT_MED_ERROR:
		return BLK_STS_MEDIUM;
	case SCSIML_STAT_TGT_FAILURE:
		return BLK_STS_TARGET;
	case SCSIML_STAT_DL_TIMEOUT:
		return BLK_STS_DURATION_LIMIT;
	}

	/* No specific scsi-ml status: fall back to the host byte. */
	switch (host_byte(result)) {
	case DID_OK:
		if (scsi_status_is_good(result))
			return BLK_STS_OK;
		return BLK_STS_IOERR;
	case DID_TRANSPORT_FAILFAST:
	case DID_TRANSPORT_MARGINAL:
		return BLK_STS_TRANSPORT;
	default:
		return BLK_STS_IOERR;
	}
}

/**
 * scsi_rq_err_bytes - determine number of bytes till the next failure boundary
 * @rq: request to examine
 *
 * Description:
 *     A request could be merge of IOs which require different failure
 *     handling.  This function determines the number of bytes which
 *     can be failed from the beginning of the request without
 *     crossing into area which need to be retried further.
 *
 * Return:
 *     The number of bytes to fail.
 */
static unsigned int scsi_rq_err_bytes(const struct request *rq)
{
	blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK;
	unsigned int bytes = 0;
	struct bio *bio;

	if (!(rq->rq_flags & RQF_MIXED_MERGE))
		return blk_rq_bytes(rq);

	/*
	 * Currently the only 'mixing' which can happen is between
	 * different fastfail types.  We can safely fail portions
	 * which have all the failfast bits that the first one has -
	 * the ones which are at least as eager to fail as the first
	 * one.
	 */
	for (bio = rq->bio; bio; bio = bio->bi_next) {
		if ((bio->bi_opf & ff) != ff)
			break;
		bytes += bio->bi_iter.bi_size;
	}

	/* this could lead to infinite loop */
	BUG_ON(blk_rq_bytes(rq) && !bytes);
	return bytes;
}

/*
 * Returns true, after logging an error, once the current jiffies value is
 * past cmd->jiffies_at_alloc + (cmd->allowed + 1) * req->timeout. Never
 * true when cmd->allowed is SCSI_CMD_RETRIES_NO_LIMIT.
 */
static bool scsi_cmd_runtime_exceeced(struct scsi_cmnd *cmd)
{
	struct request *req = scsi_cmd_to_rq(cmd);
	unsigned long wait_for;

	if (cmd->allowed == SCSI_CMD_RETRIES_NO_LIMIT)
		return false;

	wait_for = (cmd->allowed + 1) * req->timeout;
	if (time_before(cmd->jiffies_at_alloc + wait_for, jiffies)) {
		scmd_printk(KERN_ERR, cmd, "timing out command, waited %lus\n",
			    wait_for/HZ);
		return true;
	}
	return false;
}

/*
 * When ALUA transition state is returned, reprep the cmd to
 * use the ALUA handler's transition timeout. Delay the reprep
 * 1 sec to avoid aggressive retries of the target in that
 * state.
 */
#define ALUA_TRANSITION_REPREP_DELAY	1000

/* Helper for scsi_io_completion() when special action required. */
static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result)
{
	struct request *req = scsi_cmd_to_rq(cmd);
	int level = 0;
	enum {ACTION_FAIL, ACTION_REPREP, ACTION_DELAYED_REPREP,
	      ACTION_RETRY, ACTION_DELAYED_RETRY} action;
	struct scsi_sense_hdr sshdr;
	bool sense_valid;
	bool sense_current = true;      /* false implies "deferred sense" */
	blk_status_t blk_stat;

	sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
	if (sense_valid)
		sense_current = !scsi_sense_is_deferred(&sshdr);

	blk_stat = scsi_result_to_blk_status(result);

	/* First decide on an action; it is carried out in the switch below. */
	if (host_byte(result) == DID_RESET) {
		/* Third party bus reset or reset for error recovery
		 * reasons.  Just retry the command and see what
		 * happens.
		 */
		action = ACTION_RETRY;
	} else if (sense_valid && sense_current) {
		switch (sshdr.sense_key) {
		case UNIT_ATTENTION:
			if (cmd->device->removable) {
				/* Detected disc change.  Set a bit
				 * and quietly refuse further access.
				 */
				cmd->device->changed = 1;
				action = ACTION_FAIL;
			} else {
				/* Must have been a power glitch, or a
				 * bus reset.  Could not have been a
				 * media change, so we just retry the
				 * command and see what happens.
				 */
				action = ACTION_RETRY;
			}
			break;
		case ILLEGAL_REQUEST:
			/* If we had an ILLEGAL REQUEST returned, then
			 * we may have performed an unsupported
			 * command.  The only thing this should be
			 * would be a ten byte read where only a six
			 * byte read was supported.  Also, on a system
			 * where READ CAPACITY failed, we may have
			 * read past the end of the disk.
			 */
			if ((cmd->device->use_10_for_rw &&
			    sshdr.asc == 0x20 && sshdr.ascq == 0x00) &&
			    (cmd->cmnd[0] == READ_10 ||
			     cmd->cmnd[0] == WRITE_10)) {
				/* This will issue a new 6-byte command. */
				cmd->device->use_10_for_rw = 0;
				action = ACTION_REPREP;
			} else if (sshdr.asc == 0x10) /* DIX */ {
				action = ACTION_FAIL;
				blk_stat = BLK_STS_PROTECTION;
			/* INVALID COMMAND OPCODE or INVALID FIELD IN CDB */
			} else if (sshdr.asc == 0x20 || sshdr.asc == 0x24) {
				action = ACTION_FAIL;
				blk_stat = BLK_STS_TARGET;
			} else
				action = ACTION_FAIL;
			break;
		case ABORTED_COMMAND:
			action = ACTION_FAIL;
			if (sshdr.asc == 0x10) /* DIF */
				blk_stat = BLK_STS_PROTECTION;
			break;
		case NOT_READY:
			/* If the device is in the process of becoming
			 * ready, or has a temporary blockage, retry.
			 */
			if (sshdr.asc == 0x04) {
				switch (sshdr.ascq) {
				case 0x01: /* becoming ready */
				case 0x04: /* format in progress */
				case 0x05: /* rebuild in progress */
				case 0x06: /* recalculation in progress */
				case 0x07: /* operation in progress */
				case 0x08: /* Long write in progress */
				case 0x09: /* self test in progress */
				case 0x11: /* notify (enable spinup) required */
				case 0x14: /* space allocation in progress */
				case 0x1a: /* start stop unit in progress */
				case 0x1b: /* sanitize in progress */
				case 0x1d: /* configuration in progress */
				case 0x24: /* depopulation in progress */
				case 0x25: /* depopulation restore in progress */
					action = ACTION_DELAYED_RETRY;
					break;
				case 0x0a: /* ALUA state transition */
					action = ACTION_DELAYED_REPREP;
					break;
				default:
					action = ACTION_FAIL;
					break;
				}
			} else
				action = ACTION_FAIL;
			break;
		case VOLUME_OVERFLOW:
			/* See SSC3rXX or current. */
			action = ACTION_FAIL;
			break;
		case DATA_PROTECT:
			action = ACTION_FAIL;
			if ((sshdr.asc == 0x0C && sshdr.ascq == 0x12) ||
			    (sshdr.asc == 0x55 &&
			     (sshdr.ascq == 0x0E || sshdr.ascq == 0x0F))) {
				/* Insufficient zone resources */
				blk_stat = BLK_STS_ZONE_OPEN_RESOURCE;
			}
			break;
		case COMPLETED:
			fallthrough;
		default:
			action = ACTION_FAIL;
			break;
		}
	} else
		action = ACTION_FAIL;

	/* Any retry/reprep is downgraded to a failure once time has run out. */
	if (action != ACTION_FAIL && scsi_cmd_runtime_exceeced(cmd))
		action = ACTION_FAIL;

	switch (action) {
	case ACTION_FAIL:
		/* Give up and fail the remainder of the request */
		if (!(req->rq_flags & RQF_QUIET)) {
			static DEFINE_RATELIMIT_STATE(_rs,
					DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

			if (unlikely(scsi_logging_level))
				level =
				     SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT,
						    SCSI_LOG_MLCOMPLETE_BITS);

			/*
			 * if logging is enabled the failure will be printed
			 * in scsi_log_completion(), so avoid duplicate messages
			 */
			if (!level && __ratelimit(&_rs)) {
				scsi_print_result(cmd, NULL, FAILED);
				if (sense_valid)
					scsi_print_sense(cmd);
				scsi_print_command(cmd);
			}
		}
		/* Bytes left after failing this portion are re-prepared below. */
		if (!scsi_end_request(req, blk_stat, scsi_rq_err_bytes(req)))
			return;
		fallthrough;
	case ACTION_REPREP:
		scsi_mq_requeue_cmd(cmd, 0);
		break;
	case ACTION_DELAYED_REPREP:
		scsi_mq_requeue_cmd(cmd, ALUA_TRANSITION_REPREP_DELAY);
		break;
	case ACTION_RETRY:
		/* Retry the same command immediately */
		__scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY, false);
		break;
	case ACTION_DELAYED_RETRY:
		/* Retry the same command after a delay */
		__scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY, false);
		break;
	}
}

/*
 * Helper for scsi_io_completion() when cmd->result is non-zero. Returns a
 * new result that may suppress further error checking. Also modifies
 * *blk_statp in some cases.
 */
static int scsi_io_completion_nz_result(struct scsi_cmnd *cmd, int result,
					blk_status_t *blk_statp)
{
	bool sense_valid;
	bool sense_current = true;	/* false implies "deferred sense" */
	struct request *req = scsi_cmd_to_rq(cmd);
	struct scsi_sense_hdr sshdr;

	sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
	if (sense_valid)
		sense_current = !scsi_sense_is_deferred(&sshdr);

	if (blk_rq_is_passthrough(req)) {
		if (sense_valid) {
			/*
			 * SG_IO wants current and deferred errors
			 */
			cmd->sense_len = min(8 + cmd->sense_buffer[7],
					     SCSI_SENSE_BUFFERSIZE);
		}
		if (sense_current)
			*blk_statp = scsi_result_to_blk_status(result);
	} else if (blk_rq_bytes(req) == 0 && sense_current) {
		/*
		 * Flush commands do not transfer any data, and thus cannot use
		 * good_bytes != blk_rq_bytes(req) as the signal for an error.
		 * This sets *blk_statp explicitly for the problem case.
		 */
		*blk_statp = scsi_result_to_blk_status(result);
	}
	/*
	 * Recovered errors need reporting, but they're always treated as
	 * success, so fiddle the result code here.  For passthrough requests
	 * we already took a copy of the original into sreq->result which
	 * is what gets returned to the user
	 */
	if (sense_valid && (sshdr.sense_key == RECOVERED_ERROR)) {
		bool do_print = true;
		/*
		 * if ATA PASS-THROUGH INFORMATION AVAILABLE [0x0, 0x1d]
		 * skip print since caller wants ATA registers. Only occurs
		 * on SCSI ATA PASS_THROUGH commands when CK_COND=1
		 */
		if ((sshdr.asc == 0x0) && (sshdr.ascq == 0x1d))
			do_print = false;
		else if (req->rq_flags & RQF_QUIET)
			do_print = false;
		if (do_print)
			scsi_print_sense(cmd);
		result = 0;
		/* for passthrough, *blk_statp may be set */
		*blk_statp = BLK_STS_OK;
	}
	/*
	 * Another corner case: the SCSI status byte is non-zero but 'good'.
	 * Example: PRE-FETCH command returns SAM_STAT_CONDITION_MET when
	 * it is able to fit nominated LBs in its cache (and SAM_STAT_GOOD
	 * if it can't fit). Treat SAM_STAT_CONDITION_MET and the related
	 * intermediate statuses (both obsolete in SAM-4) as good.
	 */
	if ((result & 0xff) && scsi_status_is_good(result)) {
		result = 0;
		*blk_statp = BLK_STS_OK;
	}
	return result;
}

/**
 * scsi_io_completion - Completion processing for SCSI commands.
 * @cmd:	command that is finished.
 * @good_bytes:	number of processed bytes.
 *
 * We will finish off the specified number of sectors. If we are done, the
 * command block will be released and the queue function will be goosed. If we
 * are not done then we have to figure out what to do next:
 *
 *   a) We can call scsi_mq_requeue_cmd().  The request will be
 *	unprepared and put back on the queue.  Then a new command will
 *	be created for it.  This should be used if we made forward
 *	progress, or if we want to switch from READ(10) to READ(6) for
 *	example.
 *
 *   b) We can call scsi_io_completion_action().  The request will be
 *	put back on the queue and retried using the same command as
 *	before, possibly after a delay.
 *
 *   c) We can call scsi_end_request() with blk_stat other than
 *	BLK_STS_OK, to fail the remainder of the request.
*/
void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
{
	int result = cmd->result;
	struct request *req = scsi_cmd_to_rq(cmd);
	blk_status_t blk_stat = BLK_STS_OK;

	if (unlikely(result))	/* a nz result may or may not be an error */
		result = scsi_io_completion_nz_result(cmd, result, &blk_stat);

	/*
	 * Next deal with any sectors which we were able to correctly
	 * handle.
	 */
	SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, cmd,
		"%u sectors total, %d bytes done.\n",
		blk_rq_sectors(req), good_bytes));

	/*
	 * Failed, zero length commands always need to drop down
	 * to retry code. Fast path should return in this block.
	 */
	if (likely(blk_rq_bytes(req) > 0 || blk_stat == BLK_STS_OK)) {
		if (likely(!scsi_end_request(req, blk_stat, good_bytes)))
			return; /* no bytes remaining */
	}

	/* Kill remainder if no retries. */
	if (unlikely(blk_stat && scsi_noretry_cmd(cmd))) {
		if (scsi_end_request(req, blk_stat, blk_rq_bytes(req)))
			WARN_ONCE(true,
			    "Bytes remaining after failed, no-retry command");
		return;
	}

	/*
	 * If there had been no error, but we have leftover bytes in the
	 * request just queue the command up again.
	 */
	if (likely(result == 0))
		scsi_mq_requeue_cmd(cmd, 0);
	else
		scsi_io_completion_action(cmd, result);
}

/*
 * True only when the device has a non-zero dma_drain_len, @rq is a
 * passthrough request that is not a write, and the host template's
 * dma_need_drain() callback returns non-zero for it.
 */
static inline bool scsi_cmd_needs_dma_drain(struct scsi_device *sdev,
		struct request *rq)
{
	return sdev->dma_drain_len && blk_rq_is_passthrough(rq) &&
	       !op_is_write(req_op(rq)) &&
	       sdev->host->hostt->dma_need_drain(rq);
}

/**
 * scsi_alloc_sgtables - Allocate and initialize data and integrity scatterlists
 * @cmd: SCSI command data structure to initialize.
 *
 * Initializes @cmd->sdb and also @cmd->prot_sdb if data integrity is enabled
 * for @cmd.
*
 * Returns:
 * * BLK_STS_OK       - on success
 * * BLK_STS_RESOURCE - if the failure is retryable
 * * BLK_STS_IOERR    - if the failure is fatal
 */
blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd)
{
	struct scsi_device *sdev = cmd->device;
	struct request *rq = scsi_cmd_to_rq(cmd);
	unsigned short nr_segs = blk_rq_nr_phys_segments(rq);
	struct scatterlist *last_sg = NULL;
	blk_status_t ret;
	bool need_drain = scsi_cmd_needs_dma_drain(sdev, rq);
	int count;

	if (WARN_ON_ONCE(!nr_segs))
		return BLK_STS_IOERR;

	/*
	 * Make sure there is space for the drain.  The driver must adjust
	 * max_hw_segments to be prepared for this.
	 */
	if (need_drain)
		nr_segs++;

	/*
	 * If sg table allocation fails, requeue request later.
	 */
	if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, nr_segs,
			cmd->sdb.table.sgl, SCSI_INLINE_SG_CNT)))
		return BLK_STS_RESOURCE;

	/*
	 * Next, walk the list, and fill in the addresses and sizes of
	 * each segment.
	 */
	count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg);

	/* Pad the last segment when the length is not pad-mask aligned. */
	if (blk_rq_bytes(rq) & rq->q->limits.dma_pad_mask) {
		unsigned int pad_len =
			(rq->q->limits.dma_pad_mask & ~blk_rq_bytes(rq)) + 1;

		last_sg->length += pad_len;
		cmd->extra_len += pad_len;
	}

	/* Append the device's drain buffer as one extra, final segment. */
	if (need_drain) {
		sg_unmark_end(last_sg);
		last_sg = sg_next(last_sg);
		sg_set_buf(last_sg, sdev->dma_drain_buf, sdev->dma_drain_len);
		sg_mark_end(last_sg);

		cmd->extra_len += sdev->dma_drain_len;
		count++;
	}

	BUG_ON(count > cmd->sdb.table.nents);
	cmd->sdb.table.nents = count;
	cmd->sdb.length = blk_rq_payload_bytes(rq);

	if (blk_integrity_rq(rq)) {
		struct scsi_data_buffer *prot_sdb = cmd->prot_sdb;
		int ivecs;

		if (WARN_ON_ONCE(!prot_sdb)) {
			/*
			 * This can happen if someone (e.g. multipath)
			 * queues a command to a device on an adapter
			 * that does not support DIX.
			 */
			ret = BLK_STS_IOERR;
			goto out_free_sgtables;
		}

		ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);

		if (sg_alloc_table_chained(&prot_sdb->table, ivecs,
				prot_sdb->table.sgl,
				SCSI_INLINE_PROT_SG_CNT)) {
			ret = BLK_STS_RESOURCE;
			goto out_free_sgtables;
		}

		count = blk_rq_map_integrity_sg(rq->q, rq->bio,
						prot_sdb->table.sgl);
		BUG_ON(count > ivecs);
		BUG_ON(count > queue_max_integrity_segments(rq->q));

		cmd->prot_sdb = prot_sdb;
		cmd->prot_sdb->table.nents = count;
	}

	return BLK_STS_OK;
out_free_sgtables:
	/* Undo whatever was allocated above before reporting the error. */
	scsi_free_sgtables(cmd);
	return ret;
}
EXPORT_SYMBOL(scsi_alloc_sgtables);

/**
 * scsi_initialize_rq - initialize struct scsi_cmnd partially
 * @rq: Request associated with the SCSI command to be initialized.
 *
 * This function initializes the members of struct scsi_cmnd that must be
 * initialized before request processing starts and that won't be
 * reinitialized if a SCSI command is requeued.
 */
static void scsi_initialize_rq(struct request *rq)
{
	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);

	memset(cmd->cmnd, 0, sizeof(cmd->cmnd));
	cmd->cmd_len = MAX_COMMAND_SIZE;
	cmd->sense_len = 0;
	init_rcu_head(&cmd->rcu);
	cmd->jiffies_at_alloc = jiffies;
	cmd->retries = 0;
}

/*
 * Allocate a block layer request on @q and, on success, initialize its
 * scsi_cmnd with scsi_initialize_rq(). The value of blk_mq_alloc_request()
 * is returned unchanged, so callers must check it with IS_ERR().
 */
struct request *scsi_alloc_request(struct request_queue *q, blk_opf_t opf,
				   blk_mq_req_flags_t flags)
{
	struct request *rq;

	rq = blk_mq_alloc_request(q, opf, flags);
	if (!IS_ERR(rq))
		scsi_initialize_rq(rq);
	return rq;
}
EXPORT_SYMBOL_GPL(scsi_alloc_request);

/*
 * Only called when the request isn't completed by SCSI, and not freed by
 * SCSI
 */
static void scsi_cleanup_rq(struct request *rq)
{
	/* Nothing to undo unless the request carries RQF_DONTPREP. */
	if (rq->rq_flags & RQF_DONTPREP) {
		scsi_mq_uninit_cmd(blk_mq_rq_to_pdu(rq));
		rq->rq_flags &= ~RQF_DONTPREP;
	}
}

/* Called before a request is prepared. See also scsi_mq_prep_fn().
 */
void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
{
	struct request *rq = scsi_cmd_to_rq(cmd);

	/*
	 * Non-passthrough requests get the one-time scsi_initialize_rq()
	 * treatment here; SCMD_INITIALIZED makes sure it happens only once.
	 */
	if (!blk_rq_is_passthrough(rq) && !(cmd->flags & SCMD_INITIALIZED)) {
		cmd->flags |= SCMD_INITIALIZED;
		scsi_initialize_rq(rq);
	}

	cmd->device = dev;
	INIT_LIST_HEAD(&cmd->eh_entry);
	INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
}

/*
 * Set up the data buffer of a passthrough command: map the attached bio (if
 * any) into scatterlists and record the transfer size.  Returns BLK_STS_OK
 * or the error reported by scsi_alloc_sgtables().
 */
static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev,
		struct request *req)
{
	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);

	/*
	 * Passthrough requests may transfer data, in which case they must
	 * have a bio attached to them.  Or they might contain a SCSI command
	 * that does not transfer data, in which case they may optionally
	 * submit a request without an attached bio.
	 */
	if (req->bio) {
		blk_status_t ret = scsi_alloc_sgtables(cmd);
		if (unlikely(ret != BLK_STS_OK))
			return ret;
	} else {
		/* No bio means there must be no data to transfer. */
		BUG_ON(blk_rq_bytes(req));

		memset(&cmd->sdb, 0, sizeof(cmd->sdb));
	}

	cmd->transfersize = blk_rq_bytes(req);
	return BLK_STS_OK;
}

/*
 * Map the current state of @sdev to a block layer status saying whether a
 * request may be processed right now.  @req may be NULL; the RQF_PM checks
 * below are skipped in that case.
 */
static blk_status_t
scsi_device_state_check(struct scsi_device *sdev, struct request *req)
{
	switch (sdev->sdev_state) {
	case SDEV_CREATED:
		return BLK_STS_OK;
	case SDEV_OFFLINE:
	case SDEV_TRANSPORT_OFFLINE:
		/*
		 * If the device is offline we refuse to process any
		 * commands.  The device must be brought online
		 * before trying any recovery commands.
		 */
		if (!sdev->offline_already) {
			/* Log the rejection only once per offline period. */
			sdev->offline_already = true;
			sdev_printk(KERN_ERR, sdev,
				    "rejecting I/O to offline device\n");
		}
		return BLK_STS_IOERR;
	case SDEV_DEL:
		/*
		 * If the device is fully deleted, we refuse to
		 * process any commands as well.
		 */
		sdev_printk(KERN_ERR, sdev,
			    "rejecting I/O to dead device\n");
		return BLK_STS_IOERR;
	case SDEV_BLOCK:
	case SDEV_CREATED_BLOCK:
		return BLK_STS_RESOURCE;
	case SDEV_QUIESCE:
		/*
		 * If the device is quiesced we only accept power management
		 * commands.
		 */
		if (req && WARN_ON_ONCE(!(req->rq_flags & RQF_PM)))
			return BLK_STS_RESOURCE;
		return BLK_STS_OK;
	default:
		/*
		 * For any other not fully online state we only allow
		 * power management commands.
		 */
		if (req && !(req->rq_flags & RQF_PM))
			return BLK_STS_OFFLINE;
		return BLK_STS_OK;
	}
}

/*
 * scsi_dev_queue_ready: if we can send requests to sdev, assign one token
 * and return the token else return -1.
 */
static inline int scsi_dev_queue_ready(struct request_queue *q,
				  struct scsi_device *sdev)
{
	int token;

	token = sbitmap_get(&sdev->budget_map);
	if (token < 0)
		return -1;

	if (!atomic_read(&sdev->device_blocked))
		return token;

	/*
	 * Only unblock if no other commands are pending and
	 * if device_blocked has decreased to zero
	 */
	if (scsi_device_busy(sdev) > 1 ||
	    atomic_dec_return(&sdev->device_blocked) > 0) {
		/* Still blocked: give the budget token back. */
		sbitmap_put(&sdev->budget_map, token);
		return -1;
	}

	SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev,
			 "unblocking device at zero depth\n"));

	return token;
}

/*
 * scsi_target_queue_ready: checks if we can send commands to target
 * @sdev: scsi device on starget to check.
 */
static inline int scsi_target_queue_ready(struct Scsi_Host *shost,
					   struct scsi_device *sdev)
{
	struct scsi_target *starget = scsi_target(sdev);
	unsigned int busy;

	/*
	 * On a single_lun target only one device may be the current user:
	 * refuse if another sdev is recorded in starget_sdev_user, otherwise
	 * record this one.
	 */
	if (starget->single_lun) {
		spin_lock_irq(shost->host_lock);
		if (starget->starget_sdev_user &&
		    starget->starget_sdev_user != sdev) {
			spin_unlock_irq(shost->host_lock);
			return 0;
		}
		starget->starget_sdev_user = sdev;
		spin_unlock_irq(shost->host_lock);
	}

	/* can_queue <= 0: target_busy is not tracked (see out_dec below). */
	if (starget->can_queue <= 0)
		return 1;

	/* busy = number of commands counted before this one. */
	busy = atomic_inc_return(&starget->target_busy) - 1;
	if (atomic_read(&starget->target_blocked) > 0) {
		if (busy)
			goto starved;

		/*
		 * unblock after target_blocked iterates to zero
		 */
		if (atomic_dec_return(&starget->target_blocked) > 0)
			goto out_dec;

		SCSI_LOG_MLQUEUE(3, starget_printk(KERN_INFO, starget,
				 "unblocking target at zero depth\n"));
	}

	if (busy >= starget->can_queue)
		goto starved;

	return 1;

starved:
	/* Move the device to the tail of the host's starved list. */
	spin_lock_irq(shost->host_lock);
	list_move_tail(&sdev->starved_entry, &shost->starved_list);
	spin_unlock_irq(shost->host_lock);
out_dec:
	/* Undo the target_busy increment taken above. */
	if (starget->can_queue > 0)
		atomic_dec(&starget->target_busy);
	return 0;
}

/*
 * scsi_host_queue_ready: if we can send requests to shost, return 1 else
 * return 0. We must end up running the queue again whenever 0 is
 * returned, else IO can hang.
 */
static inline int scsi_host_queue_ready(struct request_queue *q,
				   struct Scsi_Host *shost,
				   struct scsi_device *sdev,
				   struct scsi_cmnd *cmd)
{
	if (atomic_read(&shost->host_blocked) > 0) {
		if (scsi_host_busy(shost) > 0)
			goto starved;

		/*
		 * unblock after host_blocked iterates to zero
		 */
		if (atomic_dec_return(&shost->host_blocked) > 0)
			goto out_dec;

		SCSI_LOG_MLQUEUE(3,
			shost_printk(KERN_INFO, shost,
				     "unblocking host at zero depth\n"));
	}

	if (shost->host_self_blocked)
		goto starved;

	/* We're OK to process the command, so we can't be starved */
	if (!list_empty(&sdev->starved_entry)) {
		spin_lock_irq(shost->host_lock);
		/* Re-check under the lock before unlinking. */
		if (!list_empty(&sdev->starved_entry))
			list_del_init(&sdev->starved_entry);
		spin_unlock_irq(shost->host_lock);
	}

	__set_bit(SCMD_STATE_INFLIGHT, &cmd->state);

	return 1;

starved:
	/* Put the device on the host's starved list unless already queued. */
	spin_lock_irq(shost->host_lock);
	if (list_empty(&sdev->starved_entry))
		list_add_tail(&sdev->starved_entry, &shost->starved_list);
	spin_unlock_irq(shost->host_lock);
out_dec:
	scsi_dec_host_busy(shost, cmd);
	return 0;
}

/*
 * Busy state exporting function for request stacking drivers.
 *
 * For efficiency, no lock is taken to check the busy state of
 * shost/starget/sdev, since the returned value is not guaranteed and
 * may be changed after request stacking drivers call the function,
 * regardless of taking lock or not.
 *
 * When scsi can't dispatch I/Os anymore and needs to kill I/Os scsi
 * needs to return 'not busy'. Otherwise, request stacking drivers
 * may hold requests forever.
 */
static bool scsi_mq_lld_busy(struct request_queue *q)
{
	struct scsi_device *sdev = q->queuedata;
	struct Scsi_Host *shost;

	/* A dying queue always reports 'not busy'. */
	if (blk_queue_dying(q))
		return false;

	shost = sdev->host;

	/*
	 * Ignore host/starget busy state.
	 * Since block layer does not have a concept of fairness across
	 * multiple queues, congestion of host/starget needs to be handled
	 * in SCSI layer.
	 */
	if (scsi_host_in_recovery(shost) || scsi_device_is_busy(sdev))
		return true;

	return false;
}

/*
 * Block layer request completion callback.
 * May be called from interrupt
 * context.
 */
static void scsi_complete(struct request *rq)
{
	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
	enum scsi_disposition disposition;

	INIT_LIST_HEAD(&cmd->eh_entry);

	/* Per-device completion statistics. */
	atomic_inc(&cmd->device->iodone_cnt);
	if (cmd->result)
		atomic_inc(&cmd->device->ioerr_cnt);

	disposition = scsi_decide_disposition(cmd);
	/* A command that exceeded its runtime is finished, not retried. */
	if (disposition != SUCCESS && scsi_cmd_runtime_exceeced(cmd))
		disposition = SUCCESS;

	scsi_log_completion(cmd, disposition);

	switch (disposition) {
	case SUCCESS:
		scsi_finish_command(cmd);
		break;
	case NEEDS_RETRY:
		scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
		break;
	case ADD_TO_MLQUEUE:
		scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
		break;
	default:
		/* Everything else is handed to the error handler. */
		scsi_eh_scmd_add(cmd);
		break;
	}
}

/**
 * scsi_dispatch_cmd - Dispatch a command to the low-level driver.
 * @cmd: command block we are dispatching.
 *
 * Return: nonzero if the request was rejected and the device's queue needs
 * to be plugged; zero otherwise, including the cases where the command is
 * failed immediately through scsi_done().
 */
static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
{
	struct Scsi_Host *host = cmd->device->host;
	int rtn = 0;

	atomic_inc(&cmd->device->iorequest_cnt);

	/* check if the device is still usable */
	if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
		/* in SDEV_DEL we error all commands. DID_NO_CONNECT
		 * returns an immediate error upwards, and signals
		 * that the device is no longer present */
		cmd->result = DID_NO_CONNECT << 16;
		goto done;
	}

	/* Check to see if the scsi lld made this device blocked. */
	if (unlikely(scsi_device_blocked(cmd->device))) {
		/*
		 * in blocked state, the command is just put back on
		 * the device queue.  The suspend state has already
		 * blocked the queue so future requests should not
		 * occur until the device transitions out of the
		 * suspend state.
		 */
		SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
			"queuecommand : device blocked\n"));
		/* Not dispatched after all: undo the request count. */
		atomic_dec(&cmd->device->iorequest_cnt);
		return SCSI_MLQUEUE_DEVICE_BUSY;
	}

	/* Store the LUN value in cmnd, if needed. */
	if (cmd->device->lun_in_cdb)
		cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) |
			       (cmd->device->lun << 5 & 0xe0);

	scsi_log_send(cmd);

	/*
	 * Before we queue this command, check if the command
	 * length exceeds what the host adapter can handle.
	 */
	if (cmd->cmd_len > cmd->device->host->max_cmd_len) {
		SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
			       "queuecommand : command too long. "
			       "cdb_size=%d host->max_cmd_len=%d\n",
			       cmd->cmd_len, cmd->device->host->max_cmd_len));
		cmd->result = (DID_ABORT << 16);
		goto done;
	}

	if (unlikely(host->shost_state == SHOST_DEL)) {
		cmd->result = (DID_NO_CONNECT << 16);
		goto done;

	}

	trace_scsi_dispatch_cmd_start(cmd);
	rtn = host->hostt->queuecommand(host, cmd);
	if (rtn) {
		atomic_dec(&cmd->device->iorequest_cnt);
		trace_scsi_dispatch_cmd_error(cmd, rtn);
		/* Any unrecognised rejection code is treated as host busy. */
		if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
		    rtn != SCSI_MLQUEUE_TARGET_BUSY)
			rtn = SCSI_MLQUEUE_HOST_BUSY;

		SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
			"queuecommand : request rejected\n"));
	}

	return rtn;
 done:
	/* Complete the command right away with the result set above. */
	scsi_done(cmd);
	return 0;
}

/* Size in bytes of the sg-list stored in the scsi-mq command-private data. */
static unsigned int scsi_mq_inline_sgl_size(struct Scsi_Host *shost)
{
	return min_t(unsigned int, shost->sg_tablesize, SCSI_INLINE_SG_CNT) *
		sizeof(struct scatterlist);
}

/*
 * Reset the per-dispatch fields of the command attached to @req, point its
 * scatterlists at the inline areas behind the command, and let either the
 * passthrough setup or the upper-level driver's init_command() fill in the
 * rest.  Returns the blk_status_t of whichever of those ran.
 */
static blk_status_t scsi_prepare_cmd(struct request *req)
{
	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
	struct scsi_device *sdev = req->q->queuedata;
	struct Scsi_Host *shost = sdev->host;
	/* Remember the in-flight bit; cmd->state is cleared below. */
	bool in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state);
	struct scatterlist *sg;

	scsi_init_command(sdev, cmd);

	cmd->eh_eflags = 0;
	cmd->prot_type = 0;
	cmd->prot_flags = 0;
	cmd->submitter = 0;
	memset(&cmd->sdb, 0, sizeof(cmd->sdb));
	cmd->underflow = 0;
	cmd->transfersize = 0;
	cmd->host_scribble = NULL;
	cmd->result = 0;
	cmd->extra_len = 0;
	cmd->state = 0;
	if (in_flight)
		__set_bit(SCMD_STATE_INFLIGHT, &cmd->state);

	/*
	 * Only clear the driver-private command data if the LLD does not supply
	 * a function to initialize that data.
	 */
	if (!shost->hostt->init_cmd_priv)
		memset(cmd + 1, 0, shost->hostt->cmd_size);

	cmd->prot_op = SCSI_PROT_NORMAL;
	if (blk_rq_bytes(req))
		cmd->sc_data_direction = rq_dma_dir(req);
	else
		cmd->sc_data_direction = DMA_NONE;

	/* The inline sg list follows the command and the LLD private data. */
	sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
	cmd->sdb.table.sgl = sg;

	if (scsi_host_get_prot(shost)) {
		memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer));

		/* The protection sg list directly follows prot_sdb itself. */
		cmd->prot_sdb->table.sgl =
			(struct scatterlist *)(cmd->prot_sdb + 1);
	}

	/*
	 * Special handling for passthrough commands, which don't go to the ULP
	 * at all:
	 */
	if (blk_rq_is_passthrough(req))
		return scsi_setup_scsi_cmnd(sdev, req);

	if (sdev->handler && sdev->handler->prep_fn) {
		blk_status_t ret = sdev->handler->prep_fn(sdev, req);

		if (ret != BLK_STS_OK)
			return ret;
	}

	/* Usually overridden by the ULP */
	cmd->allowed = 0;
	memset(cmd->cmnd, 0, sizeof(cmd->cmnd));
	return scsi_cmd_to_driver(cmd)->init_command(cmd);
}

/*
 * Common completion path behind scsi_done() and scsi_done_direct().
 * Commands submitted by the error handler go to scsi_eh_done(), commands
 * submitted by the reset ioctl are ignored, and block layer commands are
 * completed at most once (guarded by SCMD_STATE_COMPLETE).
 */
static void scsi_done_internal(struct scsi_cmnd *cmd, bool complete_directly)
{
	struct request *req = scsi_cmd_to_rq(cmd);

	switch (cmd->submitter) {
	case SUBMITTED_BY_BLOCK_LAYER:
		break;
	case SUBMITTED_BY_SCSI_ERROR_HANDLER:
		return scsi_eh_done(cmd);
	case SUBMITTED_BY_SCSI_RESET_IOCTL:
		return;
	}

	if (unlikely(blk_should_fake_timeout(scsi_cmd_to_rq(cmd)->q)))
		return;
	/* Already marked complete: nothing more to do. */
	if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state)))
		return;
	trace_scsi_dispatch_cmd_done(cmd);

	if (complete_directly)
		blk_mq_complete_request_direct(req, scsi_complete);
	else
		blk_mq_complete_request(req);
}

/* Complete @cmd via blk_mq_complete_request(). */
void scsi_done(struct scsi_cmnd *cmd)
{
	scsi_done_internal(cmd, false);
}
EXPORT_SYMBOL(scsi_done);

/* Complete @cmd via blk_mq_complete_request_direct(). */
void scsi_done_direct(struct scsi_cmnd *cmd)
{
	scsi_done_internal(cmd, true);
}
EXPORT_SYMBOL(scsi_done_direct);

/* Return @budget_token to the budget map of the device behind @q. */
static void scsi_mq_put_budget(struct request_queue *q, int budget_token)
{
	struct scsi_device *sdev = q->queuedata;

	sbitmap_put(&sdev->budget_map, budget_token);
}

/*
 * When to reinvoke queueing after a resource shortage.
 * It's 3 msecs to
 * not change behaviour from the previous unplug mechanism, experimentation
 * may prove this needs changing.
 */
#define SCSI_QUEUE_DELAY 3

/*
 * Try to obtain a budget token for the device behind @q.  Returns the token
 * on success and -1 otherwise.
 */
static int scsi_mq_get_budget(struct request_queue *q)
{
	struct scsi_device *sdev = q->queuedata;
	int token = scsi_dev_queue_ready(q, sdev);

	if (token >= 0)
		return token;

	atomic_inc(&sdev->restarts);

	/*
	 * Orders atomic_inc(&sdev->restarts) and atomic_read(&sdev->device_busy).
	 * .restarts must be incremented before .device_busy is read because the
	 * code in scsi_run_queue_async() depends on the order of these operations.
	 */
	smp_mb__after_atomic();

	/*
	 * If all in-flight requests originated from this LUN are completed
	 * before reading .device_busy, sdev->device_busy will be observed as
	 * zero, then blk_mq_delay_run_hw_queues() will dispatch this request
	 * soon. Otherwise, completion of one of these requests will observe
	 * the .restarts flag, and the request queue will be run for handling
	 * this request, see scsi_end_request().
	 */
	if (unlikely(scsi_device_busy(sdev) == 0 &&
				!scsi_device_blocked(sdev)))
		blk_mq_delay_run_hw_queues(sdev->request_queue, SCSI_QUEUE_DELAY);
	return -1;
}

/* Store the budget token in the command attached to @req. */
static void scsi_mq_set_rq_budget_token(struct request *req, int token)
{
	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);

	cmd->budget_token = token;
}

/* Read back the budget token stored in the command attached to @req. */
static int scsi_mq_get_rq_budget_token(struct request *req)
{
	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);

	return cmd->budget_token;
}

/*
 * blk-mq ->queue_rq() implementation: check device, target and host
 * readiness, prepare the command if that has not happened yet and hand it to
 * scsi_dispatch_cmd().  On any failure the resources taken so far are
 * released in reverse order through the out_* labels.
 */
static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct request *req = bd->rq;
	struct request_queue *q = req->q;
	struct scsi_device *sdev = q->queuedata;
	struct Scsi_Host *shost = sdev->host;
	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
	blk_status_t ret;
	int reason;

	WARN_ON_ONCE(cmd->budget_token < 0);

	/*
	 * If the device is not in running state we will reject some or all
	 * commands.
	 */
	if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
		ret = scsi_device_state_check(sdev, req);
		if (ret != BLK_STS_OK)
			goto out_put_budget;
	}

	/* Default status for the readiness failures below. */
	ret = BLK_STS_RESOURCE;
	if (!scsi_target_queue_ready(shost, sdev))
		goto out_put_budget;
	if (unlikely(scsi_host_in_recovery(shost))) {
		if (cmd->flags & SCMD_FAIL_IF_RECOVERING)
			ret = BLK_STS_OFFLINE;
		goto out_dec_target_busy;
	}
	if (!scsi_host_queue_ready(q, shost, sdev, cmd))
		goto out_dec_target_busy;

	/* RQF_DONTPREP marks a request that has already been prepared. */
	if (!(req->rq_flags & RQF_DONTPREP)) {
		ret = scsi_prepare_cmd(req);
		if (ret != BLK_STS_OK)
			goto out_dec_host_busy;
		req->rq_flags |= RQF_DONTPREP;
	} else {
		clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
	}

	cmd->flags &= SCMD_PRESERVED_FLAGS;
	if (sdev->simple_tags)
		cmd->flags |= SCMD_TAGGED;
	if (bd->last)
		cmd->flags |= SCMD_LAST;

	scsi_set_resid(cmd, 0);
	memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
	cmd->submitter = SUBMITTED_BY_BLOCK_LAYER;

	blk_mq_start_request(req);
	reason = scsi_dispatch_cmd(cmd);
	if (reason) {
		scsi_set_blocked(cmd, reason);
		ret = BLK_STS_RESOURCE;
		goto out_dec_host_busy;
	}

	return BLK_STS_OK;

out_dec_host_busy:
	scsi_dec_host_busy(shost, cmd);
out_dec_target_busy:
	if (scsi_target(sdev)->can_queue > 0)
		atomic_dec(&scsi_target(sdev)->target_busy);
out_put_budget:
	scsi_mq_put_budget(q, cmd->budget_token);
	cmd->budget_token = -1;
	switch (ret) {
	case BLK_STS_OK:
		break;
	case BLK_STS_RESOURCE:
		if (scsi_device_blocked(sdev))
			ret = BLK_STS_DEV_RESOURCE;
		break;
	case BLK_STS_AGAIN:
		cmd->result = DID_BUS_BUSY << 16;
		if (req->rq_flags & RQF_DONTPREP)
			scsi_mq_uninit_cmd(cmd);
		break;
	default:
		if (unlikely(!scsi_device_online(sdev)))
			cmd->result = DID_NO_CONNECT << 16;
		else
			cmd->result = DID_ERROR << 16;
		/*
		 * Make sure to release all allocated resources when
		 * we hit an error, as we will never see this command
		 * again.
		 */
		if (req->rq_flags & RQF_DONTPREP)
			scsi_mq_uninit_cmd(cmd);
		scsi_run_queue_async(sdev);
		break;
	}
	return ret;
}

/*
 * blk-mq ->init_request(): allocate the sense buffer, locate prot_sdb behind
 * the inline sg list when the host reports protection support, and run the
 * LLD's init_cmd_priv() hook if there is one.  Returns 0, -ENOMEM, or the
 * value returned by the hook.
 */
static int scsi_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
				unsigned int hctx_idx, unsigned int numa_node)
{
	struct Scsi_Host *shost = set->driver_data;
	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
	struct scatterlist *sg;
	int ret = 0;

	cmd->sense_buffer =
		kmem_cache_alloc_node(scsi_sense_cache, GFP_KERNEL, numa_node);
	if (!cmd->sense_buffer)
		return -ENOMEM;

	if (scsi_host_get_prot(shost)) {
		sg = (void *)cmd + sizeof(struct scsi_cmnd) +
			shost->hostt->cmd_size;
		cmd->prot_sdb = (void *)sg + scsi_mq_inline_sgl_size(shost);
	}

	if (shost->hostt->init_cmd_priv) {
		ret = shost->hostt->init_cmd_priv(shost, cmd);
		/* Hook failed: release the sense buffer allocated above. */
		if (ret < 0)
			kmem_cache_free(scsi_sense_cache, cmd->sense_buffer);
	}

	return ret;
}

/*
 * blk-mq ->exit_request(): run the LLD's exit_cmd_priv() hook if present and
 * free the sense buffer.
 */
static void scsi_mq_exit_request(struct blk_mq_tag_set *set, struct request *rq,
				 unsigned int hctx_idx)
{
	struct Scsi_Host *shost = set->driver_data;
	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);

	if (shost->hostt->exit_cmd_priv)
		shost->hostt->exit_cmd_priv(shost, cmd);
	kmem_cache_free(scsi_sense_cache, cmd->sense_buffer);
}

/* blk-mq ->poll(): forward to the LLD's mq_poll() hook, or return 0. */
static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
	struct Scsi_Host *shost = hctx->driver_data;

	if (shost->hostt->mq_poll)
		return shost->hostt->mq_poll(shost, hctx->queue_num);

	return 0;
}

/* blk-mq ->init_hctx(): remember the host in the hardware context. */
static int scsi_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
{
	struct Scsi_Host *shost = data;

	hctx->driver_data = shost;
	return 0;
}

/*
 * blk-mq ->map_queues(): use the LLD's map_queues() hook if provided,
 * otherwise fall back to blk_mq_map_queues() on the default map.
 */
static void scsi_map_queues(struct blk_mq_tag_set *set)
{
	struct Scsi_Host *shost = container_of(set, struct Scsi_Host, tag_set);

	if (shost->hostt->map_queues)
		return shost->hostt->map_queues(shost);
	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}

/*
 * Fill @lim from the limits advertised by @shost.  @lim is zeroed first.
 * Besides writing @lim this also clamps shost->sg_prot_tablesize and applies
 * the segment boundary and maximum segment size to shost->dma_dev.
 */
void scsi_init_limits(struct Scsi_Host *shost, struct queue_limits *lim)
{
	struct device *dev = shost->dma_dev;

	memset(lim, 0, sizeof(*lim));
	lim->max_segments =
		min_t(unsigned short, shost->sg_tablesize, SG_MAX_SEGMENTS);

	if (scsi_host_prot_dma(shost)) {
		shost->sg_prot_tablesize =
			min_not_zero(shost->sg_prot_tablesize,
				     (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS);
		BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize);
		lim->max_integrity_segments = shost->sg_prot_tablesize;
	}

	lim->max_hw_sectors = shost->max_sectors;
	lim->seg_boundary_mask = shost->dma_boundary;
	lim->max_segment_size = shost->max_segment_size;
	lim->virt_boundary_mask = shost->virt_boundary_mask;
	lim->dma_alignment = max_t(unsigned int,
		shost->dma_alignment, dma_get_cache_alignment() - 1);

	if (shost->no_highmem)
		lim->features |= BLK_FEAT_BOUNCE_HIGH;

	dma_set_seg_boundary(dev, shost->dma_boundary);
	dma_set_max_seg_size(dev, shost->max_segment_size);
}
EXPORT_SYMBOL_GPL(scsi_init_limits);

/* blk-mq operations used when the LLD has no commit_rqs() hook. */
static const struct blk_mq_ops scsi_mq_ops_no_commit = {
	.get_budget	= scsi_mq_get_budget,
	.put_budget	= scsi_mq_put_budget,
	.queue_rq	= scsi_queue_rq,
	.complete	= scsi_complete,
	.timeout	= scsi_timeout,
#ifdef CONFIG_BLK_DEBUG_FS
	.show_rq	= scsi_show_rq,
#endif
	.init_request	= scsi_mq_init_request,
	.exit_request	= scsi_mq_exit_request,
	.cleanup_rq	= scsi_cleanup_rq,
	.busy		= scsi_mq_lld_busy,
	.map_queues	= scsi_map_queues,
	.init_hctx	= scsi_init_hctx,
	.poll		= scsi_mq_poll,
	.set_rq_budget_token = scsi_mq_set_rq_budget_token,
	.get_rq_budget_token = scsi_mq_get_rq_budget_token,
};

/* blk-mq ->commit_rqs(): forward to the LLD's commit_rqs() hook. */
static void scsi_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct Scsi_Host *shost = hctx->driver_data;

	shost->hostt->commit_rqs(shost, hctx->queue_num);
}

/* Same as scsi_mq_ops_no_commit, plus the ->commit_rqs() callback. */
static const struct blk_mq_ops scsi_mq_ops = {
	.get_budget	= scsi_mq_get_budget,
	.put_budget	= scsi_mq_put_budget,
	.queue_rq	= scsi_queue_rq,
	.commit_rqs	= scsi_commit_rqs,
	.complete	= scsi_complete,
	.timeout	= scsi_timeout,
#ifdef CONFIG_BLK_DEBUG_FS
	.show_rq	= scsi_show_rq,
#endif
	.init_request	= scsi_mq_init_request,
	.exit_request	= scsi_mq_exit_request,
	.cleanup_rq	= scsi_cleanup_rq,
	.busy		= scsi_mq_lld_busy,
	.map_queues	= scsi_map_queues,
	.init_hctx	= scsi_init_hctx,
	.poll		= scsi_mq_poll,
	.set_rq_budget_token = scsi_mq_set_rq_budget_token,
	.get_rq_budget_token = scsi_mq_get_rq_budget_token,
};

/*
 * Configure shost->tag_set from the host and host template settings and
 * allocate it.  Returns the result of blk_mq_alloc_tag_set().
 */
int scsi_mq_setup_tags(struct Scsi_Host *shost)
{
	unsigned int cmd_size, sgl_size;
	struct blk_mq_tag_set *tag_set = &shost->tag_set;

	/*
	 * Per-request payload: struct scsi_cmnd, the LLD private data, the
	 * inline sg list (at least one entry) and, with protection support,
	 * a scsi_data_buffer followed by its inline protection sg list.
	 */
	sgl_size = max_t(unsigned int, sizeof(struct scatterlist),
				scsi_mq_inline_sgl_size(shost));
	cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size + sgl_size;
	if (scsi_host_get_prot(shost))
		cmd_size += sizeof(struct scsi_data_buffer) +
			sizeof(struct scatterlist) * SCSI_INLINE_PROT_SG_CNT;

	memset(tag_set, 0, sizeof(*tag_set));
	if (shost->hostt->commit_rqs)
		tag_set->ops = &scsi_mq_ops;
	else
		tag_set->ops = &scsi_mq_ops_no_commit;
	tag_set->nr_hw_queues = shost->nr_hw_queues ? : 1;
	tag_set->nr_maps = shost->nr_maps ? : 1;
	tag_set->queue_depth = shost->can_queue;
	tag_set->cmd_size = cmd_size;
	tag_set->numa_node = dev_to_node(shost->dma_dev);
	tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
	tag_set->flags |=
		BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
	if (shost->queuecommand_may_block)
		tag_set->flags |= BLK_MQ_F_BLOCKING;
	tag_set->driver_data = shost;
	if (shost->host_tagset)
		tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;

	return blk_mq_alloc_tag_set(tag_set);
}

/*
 * kref release callback for shost->tagset_refcnt: free the tag set and
 * signal shost->tagset_freed.
 */
void scsi_mq_free_tags(struct kref *kref)
{
	struct Scsi_Host *shost = container_of(kref, typeof(*shost),
					       tagset_refcnt);

	blk_mq_free_tag_set(&shost->tag_set);
	complete(&shost->tagset_freed);
}

/**
 * scsi_device_from_queue - return sdev associated with a request_queue
 * @q: The request queue to return the sdev from
 *
 * Return the sdev associated with a request queue or NULL if the
 * request_queue does not reference a SCSI device.
*/ struct scsi_device *scsi_device_from_queue(struct request_queue *q) { struct scsi_device *sdev = NULL; if (q->mq_ops == &scsi_mq_ops_no_commit || q->mq_ops == &scsi_mq_ops) sdev = q->queuedata; if (!sdev || !get_device(&sdev->sdev_gendev)) sdev = NULL; return sdev; } /* * pktcdvd should have been integrated into the SCSI layers, but for historical * reasons like the old IDE driver it isn't. This export allows it to safely * probe if a given device is a SCSI one and only attach to that. */ #ifdef CONFIG_CDROM_PKTCDVD_MODULE EXPORT_SYMBOL_GPL(scsi_device_from_queue); #endif /** * scsi_block_requests - Utility function used by low-level drivers to prevent * further commands from being queued to the device. * @shost: host in question * * There is no timer nor any other means by which the requests get unblocked * other than the low-level driver calling scsi_unblock_requests(). */ void scsi_block_requests(struct Scsi_Host *shost) { shost->host_self_blocked = 1; } EXPORT_SYMBOL(scsi_block_requests); /** * scsi_unblock_requests - Utility function used by low-level drivers to allow * further commands to be queued to the device. * @shost: host in question * * There is no timer nor any other means by which the requests get unblocked * other than the low-level driver calling scsi_unblock_requests(). This is done * as an API function so that changes to the internals of the scsi mid-layer * won't require wholesale changes to drivers that use this feature. */ void scsi_unblock_requests(struct Scsi_Host *shost) { shost->host_self_blocked = 0; scsi_run_host_queues(shost); } EXPORT_SYMBOL(scsi_unblock_requests); void scsi_exit_queue(void) { kmem_cache_destroy(scsi_sense_cache); } /** * scsi_mode_select - issue a mode select * @sdev: SCSI device to be queried * @pf: Page format bit (1 == standard, 0 == vendor specific) * @sp: Save page bit (0 == don't save, 1 == save) * @buffer: request buffer (may not be smaller than eight bytes) * @len: length of request buffer. 
* @timeout: command timeout * @retries: number of retries before failing * @data: returns a structure abstracting the mode header data * @sshdr: place to put sense data (or NULL if no sense to be collected). * must be SCSI_SENSE_BUFFERSIZE big. * * Returns zero if successful; negative error number or scsi * status on error * */ int scsi_mode_select(struct scsi_device *sdev, int pf, int sp, unsigned char *buffer, int len, int timeout, int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) { unsigned char cmd[10]; unsigned char *real_buffer; const struct scsi_exec_args exec_args = { .sshdr = sshdr, }; int ret; memset(cmd, 0, sizeof(cmd)); cmd[1] = (pf ? 0x10 : 0) | (sp ? 0x01 : 0); /* * Use MODE SELECT(10) if the device asked for it or if the mode page * and the mode select header cannot fit within the maximumm 255 bytes * of the MODE SELECT(6) command. */ if (sdev->use_10_for_ms || len + 4 > 255 || data->block_descriptor_length > 255) { if (len > 65535 - 8) return -EINVAL; real_buffer = kmalloc(8 + len, GFP_KERNEL); if (!real_buffer) return -ENOMEM; memcpy(real_buffer + 8, buffer, len); len += 8; real_buffer[0] = 0; real_buffer[1] = 0; real_buffer[2] = data->medium_type; real_buffer[3] = data->device_specific; real_buffer[4] = data->longlba ? 
0x01 : 0; real_buffer[5] = 0; put_unaligned_be16(data->block_descriptor_length, &real_buffer[6]); cmd[0] = MODE_SELECT_10; put_unaligned_be16(len, &cmd[7]); } else { if (data->longlba) return -EINVAL; real_buffer = kmalloc(4 + len, GFP_KERNEL); if (!real_buffer) return -ENOMEM; memcpy(real_buffer + 4, buffer, len); len += 4; real_buffer[0] = 0; real_buffer[1] = data->medium_type; real_buffer[2] = data->device_specific; real_buffer[3] = data->block_descriptor_length; cmd[0] = MODE_SELECT; cmd[4] = len; } ret = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_OUT, real_buffer, len, timeout, retries, &exec_args); kfree(real_buffer); return ret; } EXPORT_SYMBOL_GPL(scsi_mode_select); /** * scsi_mode_sense - issue a mode sense, falling back from 10 to six bytes if necessary. * @sdev: SCSI device to be queried * @dbd: set to prevent mode sense from returning block descriptors * @modepage: mode page being requested * @subpage: sub-page of the mode page being requested * @buffer: request buffer (may not be smaller than eight bytes) * @len: length of request buffer. * @timeout: command timeout * @retries: number of retries before failing * @data: returns a structure abstracting the mode header data * @sshdr: place to put sense data (or NULL if no sense to be collected). * must be SCSI_SENSE_BUFFERSIZE big. 
 *
 *	Returns zero if successful, or a negative error number on failure
 */
int
scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage, int subpage,
		  unsigned char *buffer, int len, int timeout, int retries,
		  struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr)
{
	unsigned char cmd[12];
	int use_10_for_ms;
	int header_length;
	int result;
	struct scsi_sense_hdr my_sshdr;
	/* Failure definitions handed to scsi_execute_cmd(): UNIT ATTENTION. */
	struct scsi_failure failure_defs[] = {
		{
			.sense = UNIT_ATTENTION,
			.asc = SCMD_FAILURE_ASC_ANY,
			.ascq = SCMD_FAILURE_ASCQ_ANY,
			.allowed = retries,
			.result = SAM_STAT_CHECK_CONDITION,
		},
		{}
	};
	struct scsi_failures failures = {
		.failure_definitions = failure_defs,
	};
	const struct scsi_exec_args exec_args = {
		/* caller might not be interested in sense, but we need it */
		.sshdr = sshdr ? : &my_sshdr,
		.failures = &failures,
	};

	memset(data, 0, sizeof(*data));
	memset(&cmd[0], 0, 12);

	dbd = sdev->set_dbd_for_ms ? 8 : dbd;
	cmd[1] = dbd & 0x18;	/* allows DBD and LLBA bits */
	cmd[2] = modepage;
	cmd[3] = subpage;

	/* From here on always use the sense header that is really filled in. */
	sshdr = exec_args.sshdr;

 retry:
	/* Re-evaluated on retry: sdev->use_10_for_ms may be cleared below. */
	use_10_for_ms = sdev->use_10_for_ms || len > 255;

	if (use_10_for_ms) {
		if (len < 8 || len > 65535)
			return -EINVAL;

		cmd[0] = MODE_SENSE_10;
		put_unaligned_be16(len, &cmd[7]);
		header_length = 8;
	} else {
		if (len < 4)
			return -EINVAL;

		cmd[0] = MODE_SENSE;
		cmd[4] = len;
		header_length = 4;
	}

	memset(buffer, 0, len);

	result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer, len,
				  timeout, retries, &exec_args);
	if (result < 0)
		return result;

	/* This code looks awful: what it's doing is making sure an
	 * ILLEGAL REQUEST sense return identifies the actual command
	 * byte as the problem.  MODE_SENSE commands can return
	 * ILLEGAL REQUEST if the code page isn't supported */

	if (!scsi_status_is_good(result)) {
		if (scsi_sense_valid(sshdr)) {
			if ((sshdr->sense_key == ILLEGAL_REQUEST) &&
			    (sshdr->asc == 0x20) && (sshdr->ascq == 0)) {
				/*
				 * Invalid command operation code: retry using
				 * MODE SENSE(6) if this was a MODE SENSE(10)
				 * request, except if the request mode page is
				 * too large for MODE SENSE single byte
				 * allocation length field.
				 */
				if (use_10_for_ms) {
					if (len > 255)
						return -EIO;
					sdev->use_10_for_ms = 0;
					goto retry;
				}
			}
		}
		return -EIO;
	}

	if (unlikely(buffer[0] == 0x86 && buffer[1] == 0x0b &&
		     (modepage == 6 || modepage == 8))) {
		/* Initio breakage? */
		header_length = 0;
		data->length = 13;
		data->medium_type = 0;
		data->device_specific = 0;
		data->longlba = 0;
		data->block_descriptor_length = 0;
	} else if (use_10_for_ms) {
		/* Decode the 8-byte MODE SENSE(10) header. */
		data->length = get_unaligned_be16(&buffer[0]) + 2;
		data->medium_type = buffer[2];
		data->device_specific = buffer[3];
		data->longlba = buffer[4] & 0x01;
		data->block_descriptor_length = get_unaligned_be16(&buffer[6]);
	} else {
		/* Decode the 4-byte MODE SENSE(6) header. */
		data->length = buffer[0] + 1;
		data->medium_type = buffer[1];
		data->device_specific = buffer[2];
		data->block_descriptor_length = buffer[3];
	}
	data->header_length = header_length;

	return 0;
}
EXPORT_SYMBOL(scsi_mode_sense);

/**
 *	scsi_test_unit_ready - test if unit is ready
 *	@sdev:	scsi device to test.
 *	@timeout: command timeout
 *	@retries: number of retries before failing
 *	@sshdr: output pointer for decoded sense information.
 *
 *	Returns the result of the last scsi_execute_cmd() call: zero if
 *	successful or an error if TUR failed.  For removable media,
 *	UNIT_ATTENTION sets ->changed flag.
 **/
int
scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries,
		     struct scsi_sense_hdr *sshdr)
{
	char cmd[] = {
		TEST_UNIT_READY, 0, 0, 0, 0, 0,
	};
	const struct scsi_exec_args exec_args = {
		.sshdr = sshdr,
	};
	int result;

	/* try to eat the UNIT_ATTENTION if there are enough retries */
	do {
		/* One attempt per iteration; @retries is counted by the loop. */
		result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, NULL, 0,
					  timeout, 1, &exec_args);
		if (sdev->removable && result > 0 && scsi_sense_valid(sshdr) &&
		    sshdr->sense_key == UNIT_ATTENTION)
			sdev->changed = 1;
	} while (result > 0 && scsi_sense_valid(sshdr) &&
		 sshdr->sense_key == UNIT_ATTENTION && --retries);

	return result;
}
EXPORT_SYMBOL(scsi_test_unit_ready);

/**
 *	scsi_device_set_state - Take the given device through the device state model.
 *	@sdev:	scsi device to change the state of.
 *	@state:	state to change to.
 *
 *	Returns zero if successful or an error if the requested
 *	transition is illegal.
 */
int
scsi_device_set_state(struct scsi_device *sdev, enum scsi_device_state state)
{
	enum scsi_device_state oldstate = sdev->sdev_state;

	/* Staying in the current state is always allowed. */
	if (state == oldstate)
		return 0;

	/*
	 * The outer switch selects the requested state; each inner switch
	 * lists the old states from which that transition is legal.
	 */
	switch (state) {
	case SDEV_CREATED:
		switch (oldstate) {
		case SDEV_CREATED_BLOCK:
			break;
		default:
			goto illegal;
		}
		break;

	case SDEV_RUNNING:
		switch (oldstate) {
		case SDEV_CREATED:
		case SDEV_OFFLINE:
		case SDEV_TRANSPORT_OFFLINE:
		case SDEV_QUIESCE:
		case SDEV_BLOCK:
			break;
		default:
			goto illegal;
		}
		break;

	case SDEV_QUIESCE:
		switch (oldstate) {
		case SDEV_RUNNING:
		case SDEV_OFFLINE:
		case SDEV_TRANSPORT_OFFLINE:
			break;
		default:
			goto illegal;
		}
		break;

	case SDEV_OFFLINE:
	case SDEV_TRANSPORT_OFFLINE:
		switch (oldstate) {
		case SDEV_CREATED:
		case SDEV_RUNNING:
		case SDEV_QUIESCE:
		case SDEV_BLOCK:
			break;
		default:
			goto illegal;
		}
		break;

	case SDEV_BLOCK:
		switch (oldstate) {
		case SDEV_RUNNING:
		case SDEV_CREATED_BLOCK:
		case SDEV_QUIESCE:
		case SDEV_OFFLINE:
			break;
		default:
			goto illegal;
		}
		break;

	case SDEV_CREATED_BLOCK:
		switch (oldstate) {
		case SDEV_CREATED:
			break;
		default:
			goto illegal;
		}
		break;

	case SDEV_CANCEL:
		switch (oldstate) {
		case SDEV_CREATED:
		case SDEV_RUNNING:
		case SDEV_QUIESCE:
		case SDEV_OFFLINE:
		case SDEV_TRANSPORT_OFFLINE:
			break;
		default:
			goto illegal;
		}
		break;

	case SDEV_DEL:
		switch (oldstate) {
		case SDEV_CREATED:
		case SDEV_RUNNING:
		case SDEV_OFFLINE:
		case SDEV_TRANSPORT_OFFLINE:
		case SDEV_CANCEL:
		case SDEV_BLOCK:
		case SDEV_CREATED_BLOCK:
			break;
		default:
			goto illegal;
		}
		break;

	}
	/* Any successful change re-arms the "offline device" log message. */
	sdev->offline_already = false;
	sdev->sdev_state = state;
	return 0;

 illegal:
	SCSI_LOG_ERROR_RECOVERY(1,
				sdev_printk(KERN_ERR, sdev,
					    "Illegal state transition %s->%s",
					    scsi_device_state_name(oldstate),
					    scsi_device_state_name(state))
				);
	return -EINVAL;
}
EXPORT_SYMBOL(scsi_device_set_state);

/**
 *	scsi_evt_emit - emit a single SCSI device uevent
 *	@sdev: associated SCSI device
 *	@evt: event to emit
 *
 *	Send a single uevent (scsi_event) to the associated scsi_device.
 */
static void scsi_evt_emit(struct scsi_device *sdev, struct scsi_event *evt)
{
	int idx = 0;
	/* At most one variable plus the terminating NULL is stored here. */
	char *envp[3];

	switch (evt->evt_type) {
	case SDEV_EVT_MEDIA_CHANGE:
		envp[idx++] = "SDEV_MEDIA_CHANGE=1";
		break;
	case SDEV_EVT_INQUIRY_CHANGE_REPORTED:
		/* Changed inquiry data also triggers a device rescan. */
		scsi_rescan_device(sdev);
		envp[idx++] = "SDEV_UA=INQUIRY_DATA_HAS_CHANGED";
		break;
	case SDEV_EVT_CAPACITY_CHANGE_REPORTED:
		envp[idx++] = "SDEV_UA=CAPACITY_DATA_HAS_CHANGED";
		break;
	case SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED:
	       envp[idx++] = "SDEV_UA=THIN_PROVISIONING_SOFT_THRESHOLD_REACHED";
		break;
	case SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED:
		envp[idx++] = "SDEV_UA=MODE_PARAMETERS_CHANGED";
		break;
	case SDEV_EVT_LUN_CHANGE_REPORTED:
		envp[idx++] = "SDEV_UA=REPORTED_LUNS_DATA_HAS_CHANGED";
		break;
	case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED:
		envp[idx++] = "SDEV_UA=ASYMMETRIC_ACCESS_STATE_CHANGED";
		break;
	case SDEV_EVT_POWER_ON_RESET_OCCURRED:
		envp[idx++] = "SDEV_UA=POWER_ON_RESET_OCCURRED";
		break;
	default:
		/* do nothing */
		break;
	}

	envp[idx++] = NULL;

	kobject_uevent_env(&sdev->sdev_gendev.kobj, KOBJ_CHANGE, envp);
}

/**
 *	scsi_evt_thread - send a uevent for each scsi event
 *	@work: work struct for
scsi_device * * Dispatch queued events to their associated scsi_device kobjects * as uevents. */ void scsi_evt_thread(struct work_struct *work) { struct scsi_device *sdev; enum scsi_device_event evt_type; LIST_HEAD(event_list); sdev = container_of(work, struct scsi_device, event_work); for (evt_type = SDEV_EVT_FIRST; evt_type <= SDEV_EVT_LAST; evt_type++) if (test_and_clear_bit(evt_type, sdev->pending_events)) sdev_evt_send_simple(sdev, evt_type, GFP_KERNEL); while (1) { struct scsi_event *evt; struct list_head *this, *tmp; unsigned long flags; spin_lock_irqsave(&sdev->list_lock, flags); list_splice_init(&sdev->event_list, &event_list); spin_unlock_irqrestore(&sdev->list_lock, flags); if (list_empty(&event_list)) break; list_for_each_safe(this, tmp, &event_list) { evt = list_entry(this, struct scsi_event, node); list_del(&evt->node); scsi_evt_emit(sdev, evt); kfree(evt); } } } /** * sdev_evt_send - send asserted event to uevent thread * @sdev: scsi_device event occurred on * @evt: event to send * * Assert scsi device event asynchronously. */ void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt) { unsigned long flags; #if 0 /* FIXME: currently this check eliminates all media change events * for polled devices. Need to update to discriminate between AN * and polled events */ if (!test_bit(evt->evt_type, sdev->supported_events)) { kfree(evt); return; } #endif spin_lock_irqsave(&sdev->list_lock, flags); list_add_tail(&evt->node, &sdev->event_list); schedule_work(&sdev->event_work); spin_unlock_irqrestore(&sdev->list_lock, flags); } EXPORT_SYMBOL_GPL(sdev_evt_send); /** * sdev_evt_alloc - allocate a new scsi event * @evt_type: type of event to allocate * @gfpflags: GFP flags for allocation * * Allocates and returns a new scsi_event. 
*/ struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type, gfp_t gfpflags) { struct scsi_event *evt = kzalloc(sizeof(struct scsi_event), gfpflags); if (!evt) return NULL; evt->evt_type = evt_type; INIT_LIST_HEAD(&evt->node); /* evt_type-specific initialization, if any */ switch (evt_type) { case SDEV_EVT_MEDIA_CHANGE: case SDEV_EVT_INQUIRY_CHANGE_REPORTED: case SDEV_EVT_CAPACITY_CHANGE_REPORTED: case SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED: case SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED: case SDEV_EVT_LUN_CHANGE_REPORTED: case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED: case SDEV_EVT_POWER_ON_RESET_OCCURRED: default: /* do nothing */ break; } return evt; } EXPORT_SYMBOL_GPL(sdev_evt_alloc); /** * sdev_evt_send_simple - send asserted event to uevent thread * @sdev: scsi_device event occurred on * @evt_type: type of event to send * @gfpflags: GFP flags for allocation * * Assert scsi device event asynchronously, given an event type. */ void sdev_evt_send_simple(struct scsi_device *sdev, enum scsi_device_event evt_type, gfp_t gfpflags) { struct scsi_event *evt = sdev_evt_alloc(evt_type, gfpflags); if (!evt) { sdev_printk(KERN_ERR, sdev, "event %d eaten due to OOM\n", evt_type); return; } sdev_evt_send(sdev, evt); } EXPORT_SYMBOL_GPL(sdev_evt_send_simple); /** * scsi_device_quiesce - Block all commands except power management. * @sdev: scsi device to quiesce. * * This works by trying to transition to the SDEV_QUIESCE state * (which must be a legal transition). When the device is in this * state, only power management requests will be accepted, all others will * be deferred. * * Must be called with user context, may sleep. * * Returns zero if unsuccessful or an error if not. */ int scsi_device_quiesce(struct scsi_device *sdev) { struct request_queue *q = sdev->request_queue; int err; /* * It is allowed to call scsi_device_quiesce() multiple times from * the same context but concurrent scsi_device_quiesce() calls are * not allowed. 
*/ WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current); if (sdev->quiesced_by == current) return 0; blk_set_pm_only(q); blk_mq_freeze_queue(q); /* * Ensure that the effect of blk_set_pm_only() will be visible * for percpu_ref_tryget() callers that occur after the queue * unfreeze even if the queue was already frozen before this function * was called. See also https://lwn.net/Articles/573497/. */ synchronize_rcu(); blk_mq_unfreeze_queue(q); mutex_lock(&sdev->state_mutex); err = scsi_device_set_state(sdev, SDEV_QUIESCE); if (err == 0) sdev->quiesced_by = current; else blk_clear_pm_only(q); mutex_unlock(&sdev->state_mutex); return err; } EXPORT_SYMBOL(scsi_device_quiesce); /** * scsi_device_resume - Restart user issued commands to a quiesced device. * @sdev: scsi device to resume. * * Moves the device from quiesced back to running and restarts the * queues. * * Must be called with user context, may sleep. */ void scsi_device_resume(struct scsi_device *sdev) { /* check if the device state was mutated prior to resume, and if * so assume the state is being managed elsewhere (for example * device deleted during suspend) */ mutex_lock(&sdev->state_mutex); if (sdev->sdev_state == SDEV_QUIESCE) scsi_device_set_state(sdev, SDEV_RUNNING); if (sdev->quiesced_by) { sdev->quiesced_by = NULL; blk_clear_pm_only(sdev->request_queue); } mutex_unlock(&sdev->state_mutex); } EXPORT_SYMBOL(scsi_device_resume); static void device_quiesce_fn(struct scsi_device *sdev, void *data) { scsi_device_quiesce(sdev); } void scsi_target_quiesce(struct scsi_target *starget) { starget_for_each_device(starget, NULL, device_quiesce_fn); } EXPORT_SYMBOL(scsi_target_quiesce); static void device_resume_fn(struct scsi_device *sdev, void *data) { scsi_device_resume(sdev); } void scsi_target_resume(struct scsi_target *starget) { starget_for_each_device(starget, NULL, device_resume_fn); } EXPORT_SYMBOL(scsi_target_resume); static int __scsi_internal_device_block_nowait(struct scsi_device *sdev) { if 
(scsi_device_set_state(sdev, SDEV_BLOCK)) return scsi_device_set_state(sdev, SDEV_CREATED_BLOCK); return 0; } void scsi_start_queue(struct scsi_device *sdev) { if (cmpxchg(&sdev->queue_stopped, 1, 0)) blk_mq_unquiesce_queue(sdev->request_queue); } static void scsi_stop_queue(struct scsi_device *sdev) { /* * The atomic variable of ->queue_stopped covers that * blk_mq_quiesce_queue* is balanced with blk_mq_unquiesce_queue. * * The caller needs to wait until quiesce is done. */ if (!cmpxchg(&sdev->queue_stopped, 0, 1)) blk_mq_quiesce_queue_nowait(sdev->request_queue); } /** * scsi_internal_device_block_nowait - try to transition to the SDEV_BLOCK state * @sdev: device to block * * Pause SCSI command processing on the specified device. Does not sleep. * * Returns zero if successful or a negative error code upon failure. * * Notes: * This routine transitions the device to the SDEV_BLOCK state (which must be * a legal transition). When the device is in this state, command processing * is paused until the device leaves the SDEV_BLOCK state. See also * scsi_internal_device_unblock_nowait(). */ int scsi_internal_device_block_nowait(struct scsi_device *sdev) { int ret = __scsi_internal_device_block_nowait(sdev); /* * The device has transitioned to SDEV_BLOCK. Stop the * block layer from calling the midlayer with this device's * request queue. */ if (!ret) scsi_stop_queue(sdev); return ret; } EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait); /** * scsi_device_block - try to transition to the SDEV_BLOCK state * @sdev: device to block * @data: dummy argument, ignored * * Pause SCSI command processing on the specified device. Callers must wait * until all ongoing scsi_queue_rq() calls have finished after this function * returns. * * Note: * This routine transitions the device to the SDEV_BLOCK state (which must be * a legal transition). When the device is in this state, command processing * is paused until the device leaves the SDEV_BLOCK state. 
See also * scsi_internal_device_unblock(). */ static void scsi_device_block(struct scsi_device *sdev, void *data) { int err; enum scsi_device_state state; mutex_lock(&sdev->state_mutex); err = __scsi_internal_device_block_nowait(sdev); state = sdev->sdev_state; if (err == 0) /* * scsi_stop_queue() must be called with the state_mutex * held. Otherwise a simultaneous scsi_start_queue() call * might unquiesce the queue before we quiesce it. */ scsi_stop_queue(sdev); mutex_unlock(&sdev->state_mutex); WARN_ONCE(err, "%s: failed to block %s in state %d\n", __func__, dev_name(&sdev->sdev_gendev), state); } /** * scsi_internal_device_unblock_nowait - resume a device after a block request * @sdev: device to resume * @new_state: state to set the device to after unblocking * * Restart the device queue for a previously suspended SCSI device. Does not * sleep. * * Returns zero if successful or a negative error code upon failure. * * Notes: * This routine transitions the device to the SDEV_RUNNING state or to one of * the offline states (which must be a legal transition) allowing the midlayer * to goose the queue for this device. */ int scsi_internal_device_unblock_nowait(struct scsi_device *sdev, enum scsi_device_state new_state) { switch (new_state) { case SDEV_RUNNING: case SDEV_TRANSPORT_OFFLINE: break; default: return -EINVAL; } /* * Try to transition the scsi device to SDEV_RUNNING or one of the * offlined states and goose the device queue if successful. 
*/ switch (sdev->sdev_state) { case SDEV_BLOCK: case SDEV_TRANSPORT_OFFLINE: sdev->sdev_state = new_state; break; case SDEV_CREATED_BLOCK: if (new_state == SDEV_TRANSPORT_OFFLINE || new_state == SDEV_OFFLINE) sdev->sdev_state = new_state; else sdev->sdev_state = SDEV_CREATED; break; case SDEV_CANCEL: case SDEV_OFFLINE: break; default: return -EINVAL; } scsi_start_queue(sdev); return 0; } EXPORT_SYMBOL_GPL(scsi_internal_device_unblock_nowait); /** * scsi_internal_device_unblock - resume a device after a block request * @sdev: device to resume * @new_state: state to set the device to after unblocking * * Restart the device queue for a previously suspended SCSI device. May sleep. * * Returns zero if successful or a negative error code upon failure. * * Notes: * This routine transitions the device to the SDEV_RUNNING state or to one of * the offline states (which must be a legal transition) allowing the midlayer * to goose the queue for this device. */ static int scsi_internal_device_unblock(struct scsi_device *sdev, enum scsi_device_state new_state) { int ret; mutex_lock(&sdev->state_mutex); ret = scsi_internal_device_unblock_nowait(sdev, new_state); mutex_unlock(&sdev->state_mutex); return ret; } static int target_block(struct device *dev, void *data) { if (scsi_is_target_device(dev)) starget_for_each_device(to_scsi_target(dev), NULL, scsi_device_block); return 0; } /** * scsi_block_targets - transition all SCSI child devices to SDEV_BLOCK state * @dev: a parent device of one or more scsi_target devices * @shost: the Scsi_Host to which this device belongs * * Iterate over all children of @dev, which should be scsi_target devices, * and switch all subordinate scsi devices to SDEV_BLOCK state. Wait for * ongoing scsi_queue_rq() calls to finish. May sleep. * * Note: * @dev must not itself be a scsi_target device. 
*/ void scsi_block_targets(struct Scsi_Host *shost, struct device *dev) { WARN_ON_ONCE(scsi_is_target_device(dev)); device_for_each_child(dev, NULL, target_block); blk_mq_wait_quiesce_done(&shost->tag_set); } EXPORT_SYMBOL_GPL(scsi_block_targets); static void device_unblock(struct scsi_device *sdev, void *data) { scsi_internal_device_unblock(sdev, *(enum scsi_device_state *)data); } static int target_unblock(struct device *dev, void *data) { if (scsi_is_target_device(dev)) starget_for_each_device(to_scsi_target(dev), data, device_unblock); return 0; } void scsi_target_unblock(struct device *dev, enum scsi_device_state new_state) { if (scsi_is_target_device(dev)) starget_for_each_device(to_scsi_target(dev), &new_state, device_unblock); else device_for_each_child(dev, &new_state, target_unblock); } EXPORT_SYMBOL_GPL(scsi_target_unblock); /** * scsi_host_block - Try to transition all logical units to the SDEV_BLOCK state * @shost: device to block * * Pause SCSI command processing for all logical units associated with the SCSI * host and wait until pending scsi_queue_rq() calls have finished. * * Returns zero if successful or a negative error code upon failure. */ int scsi_host_block(struct Scsi_Host *shost) { struct scsi_device *sdev; int ret; /* * Call scsi_internal_device_block_nowait so we can avoid * calling synchronize_rcu() for each LUN. */ shost_for_each_device(sdev, shost) { mutex_lock(&sdev->state_mutex); ret = scsi_internal_device_block_nowait(sdev); mutex_unlock(&sdev->state_mutex); if (ret) { scsi_device_put(sdev); return ret; } } /* Wait for ongoing scsi_queue_rq() calls to finish. 
*/ blk_mq_wait_quiesce_done(&shost->tag_set); return 0; } EXPORT_SYMBOL_GPL(scsi_host_block); int scsi_host_unblock(struct Scsi_Host *shost, int new_state) { struct scsi_device *sdev; int ret = 0; shost_for_each_device(sdev, shost) { ret = scsi_internal_device_unblock(sdev, new_state); if (ret) { scsi_device_put(sdev); break; } } return ret; } EXPORT_SYMBOL_GPL(scsi_host_unblock); /** * scsi_kmap_atomic_sg - find and atomically map an sg-elemnt * @sgl: scatter-gather list * @sg_count: number of segments in sg * @offset: offset in bytes into sg, on return offset into the mapped area * @len: bytes to map, on return number of bytes mapped * * Returns virtual address of the start of the mapped page */ void *scsi_kmap_atomic_sg(struct scatterlist *sgl, int sg_count, size_t *offset, size_t *len) { int i; size_t sg_len = 0, len_complete = 0; struct scatterlist *sg; struct page *page; WARN_ON(!irqs_disabled()); for_each_sg(sgl, sg, sg_count, i) { len_complete = sg_len; /* Complete sg-entries */ sg_len += sg->length; if (sg_len > *offset) break; } if (unlikely(i == sg_count)) { printk(KERN_ERR "%s: Bytes in sg: %zu, requested offset %zu, " "elements %d\n", __func__, sg_len, *offset, sg_count); WARN_ON(1); return NULL; } /* Offset starting from the beginning of first page in this sg-entry */ *offset = *offset - len_complete + sg->offset; /* Assumption: contiguous pages can be accessed as "page + i" */ page = nth_page(sg_page(sg), (*offset >> PAGE_SHIFT)); *offset &= ~PAGE_MASK; /* Bytes in this sg-entry from *offset to the end of the page */ sg_len = PAGE_SIZE - *offset; if (*len > sg_len) *len = sg_len; return kmap_atomic(page); } EXPORT_SYMBOL(scsi_kmap_atomic_sg); /** * scsi_kunmap_atomic_sg - atomically unmap a virtual address, previously mapped with scsi_kmap_atomic_sg * @virt: virtual address to be unmapped */ void scsi_kunmap_atomic_sg(void *virt) { kunmap_atomic(virt); } EXPORT_SYMBOL(scsi_kunmap_atomic_sg); void sdev_disable_disk_events(struct scsi_device *sdev) { 
atomic_inc(&sdev->disk_events_disable_depth); } EXPORT_SYMBOL(sdev_disable_disk_events); void sdev_enable_disk_events(struct scsi_device *sdev) { if (WARN_ON_ONCE(atomic_read(&sdev->disk_events_disable_depth) <= 0)) return; atomic_dec(&sdev->disk_events_disable_depth); } EXPORT_SYMBOL(sdev_enable_disk_events); static unsigned char designator_prio(const unsigned char *d) { if (d[1] & 0x30) /* not associated with LUN */ return 0; if (d[3] == 0) /* invalid length */ return 0; /* * Order of preference for lun descriptor: * - SCSI name string * - NAA IEEE Registered Extended * - EUI-64 based 16-byte * - EUI-64 based 12-byte * - NAA IEEE Registered * - NAA IEEE Extended * - EUI-64 based 8-byte * - SCSI name string (truncated) * - T10 Vendor ID * as longer descriptors reduce the likelyhood * of identification clashes. */ switch (d[1] & 0xf) { case 8: /* SCSI name string, variable-length UTF-8 */ return 9; case 3: switch (d[4] >> 4) { case 6: /* NAA registered extended */ return 8; case 5: /* NAA registered */ return 5; case 4: /* NAA extended */ return 4; case 3: /* NAA locally assigned */ return 1; default: break; } break; case 2: switch (d[3]) { case 16: /* EUI64-based, 16 byte */ return 7; case 12: /* EUI64-based, 12 byte */ return 6; case 8: /* EUI64-based, 8 byte */ return 3; default: break; } break; case 1: /* T10 vendor ID */ return 1; default: break; } return 0; } /** * scsi_vpd_lun_id - return a unique device identification * @sdev: SCSI device * @id: buffer for the identification * @id_len: length of the buffer * * Copies a unique device identification into @id based * on the information in the VPD page 0x83 of the device. * The string will be formatted as a SCSI name string. * * Returns the length of the identification or error on failure. * If the identifier is longer than the supplied buffer the actual * identifier length is returned and the buffer is not zero-padded. 
*/ int scsi_vpd_lun_id(struct scsi_device *sdev, char *id, size_t id_len) { u8 cur_id_prio = 0; u8 cur_id_size = 0; const unsigned char *d, *cur_id_str; const struct scsi_vpd *vpd_pg83; int id_size = -EINVAL; rcu_read_lock(); vpd_pg83 = rcu_dereference(sdev->vpd_pg83); if (!vpd_pg83) { rcu_read_unlock(); return -ENXIO; } /* The id string must be at least 20 bytes + terminating NULL byte */ if (id_len < 21) { rcu_read_unlock(); return -EINVAL; } memset(id, 0, id_len); for (d = vpd_pg83->data + 4; d < vpd_pg83->data + vpd_pg83->len; d += d[3] + 4) { u8 prio = designator_prio(d); if (prio == 0 || cur_id_prio > prio) continue; switch (d[1] & 0xf) { case 0x1: /* T10 Vendor ID */ if (cur_id_size > d[3]) break; cur_id_prio = prio; cur_id_size = d[3]; if (cur_id_size + 4 > id_len) cur_id_size = id_len - 4; cur_id_str = d + 4; id_size = snprintf(id, id_len, "t10.%*pE", cur_id_size, cur_id_str); break; case 0x2: /* EUI-64 */ cur_id_prio = prio; cur_id_size = d[3]; cur_id_str = d + 4; switch (cur_id_size) { case 8: id_size = snprintf(id, id_len, "eui.%8phN", cur_id_str); break; case 12: id_size = snprintf(id, id_len, "eui.%12phN", cur_id_str); break; case 16: id_size = snprintf(id, id_len, "eui.%16phN", cur_id_str); break; default: break; } break; case 0x3: /* NAA */ cur_id_prio = prio; cur_id_size = d[3]; cur_id_str = d + 4; switch (cur_id_size) { case 8: id_size = snprintf(id, id_len, "naa.%8phN", cur_id_str); break; case 16: id_size = snprintf(id, id_len, "naa.%16phN", cur_id_str); break; default: break; } break; case 0x8: /* SCSI name string */ if (cur_id_size > d[3]) break; /* Prefer others for truncated descriptor */ if (d[3] > id_len) { prio = 2; if (cur_id_prio > prio) break; } cur_id_prio = prio; cur_id_size = id_size = d[3]; cur_id_str = d + 4; if (cur_id_size >= id_len) cur_id_size = id_len - 1; memcpy(id, cur_id_str, cur_id_size); break; default: break; } } rcu_read_unlock(); return id_size; } EXPORT_SYMBOL(scsi_vpd_lun_id); /* * scsi_vpd_tpg_id - return a target 
port group identifier * @sdev: SCSI device * * Returns the Target Port Group identifier from the information * froom VPD page 0x83 of the device. * * Returns the identifier or error on failure. */ int scsi_vpd_tpg_id(struct scsi_device *sdev, int *rel_id) { const unsigned char *d; const struct scsi_vpd *vpd_pg83; int group_id = -EAGAIN, rel_port = -1; rcu_read_lock(); vpd_pg83 = rcu_dereference(sdev->vpd_pg83); if (!vpd_pg83) { rcu_read_unlock(); return -ENXIO; } d = vpd_pg83->data + 4; while (d < vpd_pg83->data + vpd_pg83->len) { switch (d[1] & 0xf) { case 0x4: /* Relative target port */ rel_port = get_unaligned_be16(&d[6]); break; case 0x5: /* Target port group */ group_id = get_unaligned_be16(&d[6]); break; default: break; } d += d[3] + 4; } rcu_read_unlock(); if (group_id >= 0 && rel_id && rel_port != -1) *rel_id = rel_port; return group_id; } EXPORT_SYMBOL(scsi_vpd_tpg_id); /** * scsi_build_sense - build sense data for a command * @scmd: scsi command for which the sense should be formatted * @desc: Sense format (non-zero == descriptor format, * 0 == fixed format) * @key: Sense key * @asc: Additional sense code * @ascq: Additional sense code qualifier * **/ void scsi_build_sense(struct scsi_cmnd *scmd, int desc, u8 key, u8 asc, u8 ascq) { scsi_build_sense_buffer(desc, scmd->sense_buffer, key, asc, ascq); scmd->result = SAM_STAT_CHECK_CONDITION; } EXPORT_SYMBOL_GPL(scsi_build_sense); #ifdef CONFIG_SCSI_LIB_KUNIT_TEST #include "scsi_lib_test.c" #endif |
| 21 14 19 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 | /* * linux/fs/nls/nls_cp863.c * * Charset cp863 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. 
*/ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00c2, 0x00e0, 0x00b6, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x2017, 0x00c0, 0x00a7, /* 0x90*/ 0x00c9, 0x00c8, 0x00ca, 0x00f4, 0x00cb, 0x00cf, 0x00fb, 0x00f9, 0x00a4, 0x00d4, 0x00dc, 0x00a2, 0x00a3, 0x00d9, 0x00db, 0x0192, /* 0xa0*/ 0x00a6, 0x00b4, 0x00f3, 0x00fa, 0x00a8, 0x00b8, 0x00b3, 0x00af, 0x00ce, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00be, 0x00ab, 0x00bb, /* 0xb0*/ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, /* 0xc0*/ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, /* 0xd0*/ 0x2568, 0x2564, 0x2565, 
0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, /* 0xe0*/ 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229, /* 0xf0*/ 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xff, 0x00, 0x9b, 0x9c, 0x98, 0x00, 0xa0, 0x8f, /* 0xa0-0xa7 */ 0xa4, 0x00, 0x00, 0xae, 0xaa, 0x00, 0x00, 0xa7, /* 0xa8-0xaf */ 0xf8, 0xf1, 0xfd, 0xa6, 0xa1, 0xe6, 0x86, 0xfa, /* 0xb0-0xb7 */ 0xa5, 0x00, 0x00, 0xaf, 0xac, 0xab, 0xad, 0x00, /* 0xb8-0xbf */ 0x8e, 0x00, 0x84, 0x00, 0x00, 0x00, 
0x00, 0x80, /* 0xc0-0xc7 */ 0x91, 0x90, 0x92, 0x94, 0x00, 0x00, 0xa8, 0x95, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x9d, 0x00, 0x9e, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ 0x85, 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x87, /* 0xe0-0xe7 */ 0x8a, 0x82, 0x88, 0x89, 0x00, 0x00, 0x8c, 0x8b, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0xa2, 0x93, 0x00, 0x00, 0xf6, /* 0xf0-0xf7 */ 0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */ 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8d, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */ }; static const unsigned char page23[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ }; static const unsigned char page25[256] = { 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 
0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */ 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */ 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */ 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, NULL, page03, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, NULL, page22, page23, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x87, 0x81, 0x82, 0x83, 0x83, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x85, 0x8f, /* 0x88-0x8f */ 0x82, 0x8a, 0x88, 0x93, 0x89, 0x8b, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x93, 0x81, 0x9b, 0x9c, 0x97, 0x96, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0x8c, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 
0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x9a, 0x90, 0x84, 0x84, 0x8e, 0x86, 0x80, /* 0x80-0x87 */ 0x92, 0x94, 0x91, 0x95, 0xa8, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x99, 0x94, 0x95, 0x9e, 0x9d, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */ 0xa0, 0xa1, 0x00, 0x00, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int 
char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp863", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp863(void) { return register_nls(&table); } static void __exit exit_nls_cp863(void) { unregister_nls(&table); } module_init(init_nls_cp863) module_exit(exit_nls_cp863) MODULE_DESCRIPTION("NLS Codepage 863 (Canadian French)"); MODULE_LICENSE("Dual BSD/GPL"); |
| 184 183 183 184 174 175 169 167 1 167 167 183 108 59 116 88 30 98 94 40 66 59 59 67 2 111 9 101 88 3 25 10 5 88 184 1 2 101 101 88 88 55 57 7 7 1 1 1 18 28 149 27 27 10 167 5 126 37 37 27 1 145 6 142 38 160 136 36 25 46 17 152 43 107 36 118 53 128 48 79 78 1 8 22 22 22 152 152 89 10 81 88 88 1 2 4 80 6 11 74 4 159 84 79 159 159 3 159 158 35 3 31 23 19 7 19 7 21 1 48 29 19 28 20 5 15 15 15 11 11 12 9 2 3 3 27 27 23 20 1 2 26 8 22 28 35 34 1 23 5 11 11 4 2 5 32 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 
411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 
911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 | // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/dir.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <asm/unaligned.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/sched/signal.h> #include <linux/unicode.h> #include "f2fs.h" #include "node.h" #include "acl.h" #include "xattr.h" #include <trace/events/f2fs.h> #if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif static unsigned long dir_blocks(struct inode *inode) { return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) >> PAGE_SHIFT; } static unsigned int dir_buckets(unsigned int level, int dir_level) { if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) return BIT(level + dir_level); else return MAX_DIR_BUCKETS; } static unsigned int bucket_blocks(unsigned int level) { if (level < MAX_DIR_HASH_DEPTH / 2) return 2; else return 4; } #if IS_ENABLED(CONFIG_UNICODE) /* If @dir is casefolded, initialize @fname->cf_name from @fname->usr_fname. 
*/ int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) { struct super_block *sb = dir->i_sb; unsigned char *buf; int len; if (IS_CASEFOLDED(dir) && !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) { buf = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, GFP_NOFS, false, F2FS_SB(sb)); if (!buf) return -ENOMEM; len = utf8_casefold(sb->s_encoding, fname->usr_fname, buf, F2FS_NAME_LEN); if (len <= 0) { kmem_cache_free(f2fs_cf_name_slab, buf); if (sb_has_strict_encoding(sb)) return -EINVAL; /* fall back to treating name as opaque byte sequence */ return 0; } fname->cf_name.name = buf; fname->cf_name.len = len; } return 0; } void f2fs_free_casefolded_name(struct f2fs_filename *fname) { unsigned char *buf = (unsigned char *)fname->cf_name.name; if (buf) { kmem_cache_free(f2fs_cf_name_slab, buf); fname->cf_name.name = NULL; } } #endif /* CONFIG_UNICODE */ static int __f2fs_setup_filename(const struct inode *dir, const struct fscrypt_name *crypt_name, struct f2fs_filename *fname) { int err; memset(fname, 0, sizeof(*fname)); fname->usr_fname = crypt_name->usr_fname; fname->disk_name = crypt_name->disk_name; #ifdef CONFIG_FS_ENCRYPTION fname->crypto_buf = crypt_name->crypto_buf; #endif if (crypt_name->is_nokey_name) { /* hash was decoded from the no-key name */ fname->hash = cpu_to_le32(crypt_name->hash); } else { err = f2fs_init_casefolded_name(dir, fname); if (err) { f2fs_free_filename(fname); return err; } f2fs_hash_filename(dir, fname); } return 0; } /* * Prepare to search for @iname in @dir. This is similar to * fscrypt_setup_filename(), but this also handles computing the casefolded name * and the f2fs dirhash if needed, then packing all the information about this * filename up into a 'struct f2fs_filename'. 
*/ int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct f2fs_filename *fname) { struct fscrypt_name crypt_name; int err; err = fscrypt_setup_filename(dir, iname, lookup, &crypt_name); if (err) return err; return __f2fs_setup_filename(dir, &crypt_name, fname); } /* * Prepare to look up @dentry in @dir. This is similar to * fscrypt_prepare_lookup(), but this also handles computing the casefolded name * and the f2fs dirhash if needed, then packing all the information about this * filename up into a 'struct f2fs_filename'. */ int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry, struct f2fs_filename *fname) { struct fscrypt_name crypt_name; int err; err = fscrypt_prepare_lookup(dir, dentry, &crypt_name); if (err) return err; return __f2fs_setup_filename(dir, &crypt_name, fname); } void f2fs_free_filename(struct f2fs_filename *fname) { #ifdef CONFIG_FS_ENCRYPTION kfree(fname->crypto_buf.name); fname->crypto_buf.name = NULL; #endif f2fs_free_casefolded_name(fname); } static unsigned long dir_block_index(unsigned int level, int dir_level, unsigned int idx) { unsigned long i; unsigned long bidx = 0; for (i = 0; i < level; i++) bidx += dir_buckets(i, dir_level) * bucket_blocks(i); bidx += idx * bucket_blocks(level); return bidx; } static struct f2fs_dir_entry *find_in_block(struct inode *dir, struct page *dentry_page, const struct f2fs_filename *fname, int *max_slots) { struct f2fs_dentry_block *dentry_blk; struct f2fs_dentry_ptr d; dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(dir, &d, dentry_blk); return f2fs_find_target_dentry(&d, fname, max_slots); } static inline int f2fs_match_name(const struct inode *dir, const struct f2fs_filename *fname, const u8 *de_name, u32 de_name_len) { struct fscrypt_name f; #if IS_ENABLED(CONFIG_UNICODE) if (fname->cf_name.name) return generic_ci_match(dir, fname->usr_fname, &fname->cf_name, de_name, de_name_len); #endif f.usr_fname = fname->usr_fname; 
f.disk_name = fname->disk_name; #ifdef CONFIG_FS_ENCRYPTION f.crypto_buf = fname->crypto_buf; #endif return fscrypt_match_name(&f, de_name, de_name_len); } struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, const struct f2fs_filename *fname, int *max_slots) { struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; int res = 0; if (max_slots) *max_slots = 0; while (bit_pos < d->max) { if (!test_bit_le(bit_pos, d->bitmap)) { bit_pos++; max_len++; continue; } de = &d->dentry[bit_pos]; if (unlikely(!de->name_len)) { bit_pos++; continue; } if (de->hash_code == fname->hash) { res = f2fs_match_name(d->inode, fname, d->filename[bit_pos], le16_to_cpu(de->name_len)); if (res < 0) return ERR_PTR(res); if (res) goto found; } if (max_slots && max_len > *max_slots) *max_slots = max_len; max_len = 0; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } de = NULL; found: if (max_slots && max_len > *max_slots) *max_slots = max_len; return de; } static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int level, const struct f2fs_filename *fname, struct page **res_page) { int s = GET_DENTRY_SLOTS(fname->disk_name.len); unsigned int nbucket, nblock; unsigned int bidx, end_block; struct page *dentry_page; struct f2fs_dir_entry *de = NULL; pgoff_t next_pgofs; bool room = false; int max_slots; nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, le32_to_cpu(fname->hash) % nbucket); end_block = bidx + nblock; while (bidx < end_block) { /* no need to allocate new dentry pages to all the indices */ dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) { room = true; bidx = next_pgofs; continue; } else { *res_page = dentry_page; break; } } de = find_in_block(dir, dentry_page, fname, &max_slots); if (IS_ERR(de)) { *res_page = ERR_CAST(de); de = NULL; break; } else if (de) 
{ *res_page = dentry_page; break; } if (max_slots >= s) room = true; f2fs_put_page(dentry_page, 0); bidx++; } if (!de && room && F2FS_I(dir)->chash != fname->hash) { F2FS_I(dir)->chash = fname->hash; F2FS_I(dir)->clevel = level; } return de; } struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, const struct f2fs_filename *fname, struct page **res_page) { unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; unsigned int max_depth; unsigned int level; *res_page = NULL; if (f2fs_has_inline_dentry(dir)) { de = f2fs_find_in_inline_dir(dir, fname, res_page); goto out; } if (npages == 0) goto out; max_depth = F2FS_I(dir)->i_current_depth; if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { f2fs_warn(F2FS_I_SB(dir), "Corrupted max_depth of %lu: %u", dir->i_ino, max_depth); max_depth = MAX_DIR_HASH_DEPTH; f2fs_i_depth_write(dir, max_depth); } for (level = 0; level < max_depth; level++) { de = find_in_level(dir, level, fname, res_page); if (de || IS_ERR(*res_page)) break; } out: /* This is to increase the speed of f2fs_create */ if (!de) F2FS_I(dir)->task = current; return de; } /* * Find an entry in the specified directory with the wanted name. * It returns the page where the entry was found (as a parameter - res_page), * and the entry itself. Page is returned mapped and unlocked. * Entry is guaranteed to be valid. 
*/
struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
			const struct qstr *child, struct page **res_page)
{
	struct f2fs_dir_entry *de = NULL;
	struct f2fs_filename fname;
	int err;

	err = f2fs_setup_filename(dir, child, 1, &fname);
	if (err) {
		/*
		 * -ENOENT is reported as a plain "not found" (NULL page);
		 * any other error is passed back through *res_page.
		 */
		if (err == -ENOENT)
			*res_page = NULL;
		else
			*res_page = ERR_PTR(err);
		return NULL;
	}

	de = __f2fs_find_entry(dir, &fname, res_page);

	f2fs_free_filename(&fname);
	return de;
}

/*
 * Look up dotdot_name (presumably the ".." entry) in @dir; result and *@p
 * follow f2fs_find_entry().
 */
struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
{
	return f2fs_find_entry(dir, &dotdot_name, p);
}

/*
 * Return the inode number stored in the entry named @qstr in @dir, or 0 when
 * no entry is returned by f2fs_find_entry().  On success the page reference
 * is dropped here, but *@page still holds the (now unreferenced) pointer; on
 * failure *@page is whatever f2fs_find_entry() stored (NULL or an ERR_PTR).
 */
ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
							struct page **page)
{
	ino_t res = 0;
	struct f2fs_dir_entry *de;

	de = f2fs_find_entry(dir, qstr, page);
	if (de) {
		res = le32_to_cpu(de->ino);
		f2fs_put_page(*page, 0);
	}

	return res;
}

/*
 * Re-point the existing entry @de (located in @page) at @inode: its inode
 * number and file type are rewritten, the page is dirtied, and @dir gets new
 * mtime/ctime.  @page is locked here and released with f2fs_put_page(page, 1).
 */
void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
		struct page *page, struct inode *inode)
{
	/* Inline dentries are handled as NODE pages, regular ones as DATA. */
	enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;

	lock_page(page);
	f2fs_wait_on_page_writeback(page, type, true, true);
	de->ino = cpu_to_le32(inode->i_ino);
	de->file_type = fs_umode_to_ftype(inode->i_mode);
	set_page_dirty(page);

	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
	f2fs_mark_inode_dirty_sync(dir, false);
	f2fs_put_page(page, 1);
}

/*
 * Record the on-disk name of @fname in the inode page @ipage of @inode.
 * Does nothing when @fname is NULL.  For encrypted directories the inode is
 * additionally flagged via file_set_enc_name(); see the comment below for the
 * encrypted + casefolded case.  @ipage is dirtied.
 */
static void init_dent_inode(struct inode *dir, struct inode *inode,
			    const struct f2fs_filename *fname,
			    struct page *ipage)
{
	struct f2fs_inode *ri;

	if (!fname) /* tmpfile case? */
		return;

	f2fs_wait_on_page_writeback(ipage, NODE, true, true);

	/* copy name info. to this inode page */
	ri = F2FS_INODE(ipage);
	ri->i_namelen = cpu_to_le32(fname->disk_name.len);
	memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len);
	if (IS_ENCRYPTED(dir)) {
		file_set_enc_name(inode);
		/*
		 * Roll-forward recovery doesn't have encryption keys available,
		 * so it can't compute the dirhash for encrypted+casefolded
		 * filenames.  Append it to i_name if possible.  Else, disable
		 * roll-forward recovery of the dentry (i.e., make fsync'ing the
		 * file force a checkpoint) by setting LOST_PINO.
		 */
		if (IS_CASEFOLDED(dir)) {
			if (fname->disk_name.len + sizeof(f2fs_hash_t) <=
			    F2FS_NAME_LEN)
				put_unaligned(fname->hash, (f2fs_hash_t *)
					&ri->i_name[fname->disk_name.len]);
			else
				file_lost_pino(inode);
		}
	}
	set_page_dirty(ipage);
}

/*
 * Write the "." and ".." entries of a new directory into @d: "." at slot 0
 * pointing at @inode and ".." at slot 1 pointing at @parent, both with a
 * name hash of 0.
 */
void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent,
					struct f2fs_dentry_ptr *d)
{
	struct fscrypt_str dot = FSTR_INIT(".", 1);
	struct fscrypt_str dotdot = FSTR_INIT("..", 2);

	/* update dirent of "." */
	f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0);

	/* update dirent of ".." */
	f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1);
}

/*
 * Create the initial dentry block of the new directory @inode.  Inline
 * directories are delegated to f2fs_make_empty_inline_dir(); otherwise block
 * 0 is obtained, filled with "." / "..", dirtied and released.
 * Returns 0 or a negative error from obtaining the page.
 */
static int make_empty_dir(struct inode *inode,
		struct inode *parent, struct page *page)
{
	struct page *dentry_page;
	struct f2fs_dentry_block *dentry_blk;
	struct f2fs_dentry_ptr d;

	if (f2fs_has_inline_dentry(inode))
		return f2fs_make_empty_inline_dir(inode, parent, page);

	dentry_page = f2fs_get_new_data_page(inode, page, 0, true);
	if (IS_ERR(dentry_page))
		return PTR_ERR(dentry_page);

	dentry_blk = page_address(dentry_page);

	make_dentry_ptr_block(NULL, &d, dentry_blk);
	f2fs_do_make_empty_dir(inode, parent, &d);

	set_page_dirty(dentry_page);
	f2fs_put_page(dentry_page, 1);
	return 0;
}

/*
 * Obtain the inode page of @inode and prepare it for being linked into @dir.
 *
 * For a new inode (FI_NEW_INODE) a fresh inode page is created and, in order,
 * the empty directory block (directories only), ACLs, security attributes and
 * the encryption context are initialised.  For an existing inode its node
 * page is fetched.  The entry name is then recorded in the page and, if
 * FI_INC_LINK is set, the link count is raised.
 *
 * Returns the inode page, or an ERR_PTR.  On an initialisation failure the
 * link count is cleared, the inode is written to the page and the page is
 * released before the error is returned.
 */
struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
			const struct f2fs_filename *fname, struct page *dpage)
{
	struct page *page;
	int err;

	if (is_inode_flag_set(inode, FI_NEW_INODE)) {
		page = f2fs_new_inode_page(inode);
		if (IS_ERR(page))
			return page;

		if (S_ISDIR(inode->i_mode)) {
			/* in order to handle error case */
			get_page(page);
			err = make_empty_dir(inode, dir, page);
			if (err) {
				/*
				 * put_error releases the page with
				 * f2fs_put_page(page, 1), so take the page
				 * lock again before jumping there.
				 * NOTE(review): presumably make_empty_dir()
				 * returns with the page unlocked on failure -
				 * confirm against its callees.
				 */
				lock_page(page);
				goto put_error;
			}
			/* Success: drop the extra reference taken above. */
			put_page(page);
		}

		err = f2fs_init_acl(inode, dir, page, dpage);
		if (err)
			goto put_error;

		err = f2fs_init_security(inode, dir,
					 fname ? fname->usr_fname : NULL, page);
		if (err)
			goto put_error;

		if (IS_ENCRYPTED(inode)) {
			err = fscrypt_set_context(inode, page);
			if (err)
				goto put_error;
		}
	} else {
		page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino);
		if (IS_ERR(page))
			return page;
	}

	init_dent_inode(dir, inode, fname, page);

	/*
	 * This file should be checkpointed during fsync.
	 * We lost i_pino from now on.
	 */
	if (is_inode_flag_set(inode, FI_INC_LINK)) {
		if (!S_ISDIR(inode->i_mode))
			file_lost_pino(inode);
		/*
		 * If link the tmpfile to alias through linkat path,
		 * we should remove this inode from orphan list.
		 */
		if (inode->i_nlink == 0)
			f2fs_remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
		f2fs_i_links_write(inode, true);
	}
	return page;

put_error:
	clear_nlink(inode);
	f2fs_update_inode(inode, page);
	f2fs_put_page(page, 1);
	return ERR_PTR(err);
}

/*
 * Update @dir after an entry for @inode (may be NULL) has been added: bump
 * the directory's link count for a new sub-directory, refresh mtime/ctime,
 * store @current_depth if it changed, and clear the FI_NEW_INODE and
 * FI_INC_LINK flags on @inode.
 */
void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode,
						unsigned int current_depth)
{
	if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) {
		if (S_ISDIR(inode->i_mode))
			f2fs_i_links_write(dir, true);
		clear_inode_flag(inode, FI_NEW_INODE);
	}
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
	f2fs_mark_inode_dirty_sync(dir, false);

	if (F2FS_I(dir)->i_current_depth != current_depth)
		f2fs_i_depth_write(dir, current_depth);

	if (inode && is_inode_flag_set(inode, FI_INC_LINK))
		clear_inode_flag(inode, FI_INC_LINK);
}

/*
 * Find the first run of at least @slots consecutive clear bits among the
 * first @max_slots bits of @bitmap.  Returns the index of the first bit of
 * that run, or @max_slots when there is no such run.
 */
int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots)
{
	int bit_start = 0;
	int zero_start, zero_end;
next:
	zero_start = find_next_zero_bit_le(bitmap, max_slots, bit_start);
	if (zero_start >= max_slots)
		return max_slots;

	/* zero_end is the first set bit after the run (or max_slots). */
	zero_end = find_next_bit_le(bitmap, max_slots, zero_start);
	if (zero_end - zero_start >= slots)
		return zero_start;

	/* Run too short: continue just past the set bit that ended it. */
	bit_start = zero_end + 1;

	if (zero_end + 1 >= max_slots)
		return max_slots;
	goto next;
}

/*
 * Return true when the inline dentry area in @ipage has a free run of slots
 * large enough for the on-disk name of @fname.
 */
bool f2fs_has_enough_room(struct inode *dir, struct page *ipage,
			  const struct f2fs_filename *fname)
{
	struct f2fs_dentry_ptr d;
	unsigned int bit_pos;
	int slots = GET_DENTRY_SLOTS(fname->disk_name.len);

	make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ipage));

	bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max);

	return bit_pos < d.max;
}

/*
 * Write a directory entry for @name at slot @bit_pos of @d: hash, name
 * length, name bytes, inode number and file type go into the first slot, and
 * every slot the name occupies is marked in-use in the bitmap.
 */
void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
			const struct fscrypt_str *name, f2fs_hash_t name_hash,
			unsigned int bit_pos)
{
	struct f2fs_dir_entry *de;
	int slots = GET_DENTRY_SLOTS(name->len);
	int i;

	de = &d->dentry[bit_pos];
	de->hash_code = name_hash;
	de->name_len = cpu_to_le16(name->len);
	memcpy(d->filename[bit_pos], name->name, name->len);
	de->ino = cpu_to_le32(ino);
	de->file_type = fs_umode_to_ftype(mode);
	for (i = 0; i < slots; i++) {
		__set_bit_le(bit_pos + i, (void *)d->bitmap);
		/* avoid wrong garbage data for readdir */
		if (i)
			(de + i)->name_len = 0;
	}
}

/*
 * Add an entry for @fname to the regular (non-inline) dentry blocks of @dir.
 *
 * Starting at level 0, or at the level remembered in the directory inode
 * when its cached hash equals @fname's hash, each block of the target bucket
 * is checked for a large enough free run; if none has one the next level is
 * tried.  Returns 0 on success, -ENOSPC when the maximum depth is reached
 * (or the FAULT_DIR_DEPTH injection fires), or another negative error.
 *
 * When @inode is non-NULL its i_sem is held for writing from before
 * f2fs_init_inode_metadata() until after the parent metadata update.
 */
int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname,
			   struct inode *inode, nid_t ino, umode_t mode)
{
	unsigned int bit_pos;
	unsigned int level;
	unsigned int current_depth;
	unsigned long bidx, block;
	unsigned int nbucket, nblock;
	struct page *dentry_page = NULL;
	struct f2fs_dentry_block *dentry_blk = NULL;
	struct f2fs_dentry_ptr d;
	struct page *page = NULL;
	int slots, err = 0;

	level = 0;
	slots = GET_DENTRY_SLOTS(fname->disk_name.len);

	current_depth = F2FS_I(dir)->i_current_depth;
	/* Consume the cached hash/level pair when it is for this name. */
	if (F2FS_I(dir)->chash == fname->hash) {
		level = F2FS_I(dir)->clevel;
		F2FS_I(dir)->chash = 0;
	}

start:
	if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH))
		return -ENOSPC;

	if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
		return -ENOSPC;

	/* Increase the depth, if required */
	if (level == current_depth)
		++current_depth;

	nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
	nblock = bucket_blocks(level);

	bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
				(le32_to_cpu(fname->hash) % nbucket));

	for (block = bidx; block <= (bidx + nblock - 1); block++) {
		dentry_page = f2fs_get_new_data_page(dir, NULL, block, true);
		if (IS_ERR(dentry_page))
			return PTR_ERR(dentry_page);

		dentry_blk = page_address(dentry_page);
		bit_pos = f2fs_room_for_filename(&dentry_blk->dentry_bitmap,
						slots, NR_DENTRY_IN_BLOCK);
		if (bit_pos < NR_DENTRY_IN_BLOCK)
			goto add_dentry;

		f2fs_put_page(dentry_page, 1);
	}

	/* Move to next level to find the empty slot for new dentry */
	++level;
	goto start;
add_dentry:
	/* dentry_page is still held here; it is released at "fail:" below. */
	f2fs_wait_on_page_writeback(dentry_page, DATA, true, true);

	if (inode) {
		f2fs_down_write(&F2FS_I(inode)->i_sem);
		page = f2fs_init_inode_metadata(inode, dir, fname, NULL);
		if (IS_ERR(page)) {
			err = PTR_ERR(page);
			goto fail;
		}
	}

	make_dentry_ptr_block(NULL, &d, dentry_blk);
	f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash,
			   bit_pos);

	set_page_dirty(dentry_page);

	if (inode) {
		f2fs_i_pino_write(inode, dir->i_ino);

		/* synchronize inode page's data from inode cache */
		if (is_inode_flag_set(inode, FI_NEW_INODE))
			f2fs_update_inode(inode, page);

		f2fs_put_page(page, 1);
	}

	f2fs_update_parent_metadata(dir, inode, current_depth);
fail:
	/* Shared exit for success and for inode metadata failure. */
	if (inode)
		f2fs_up_write(&F2FS_I(inode)->i_sem);

	f2fs_put_page(dentry_page, 1);

	return err;
}

/*
 * Add an entry for @fname to @dir.  The inline dentry area is tried first
 * when the directory has one; a result of -EAGAIN from that attempt (or the
 * absence of an inline area) falls through to f2fs_add_regular_entry().
 */
int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname,
		    struct inode *inode, nid_t ino, umode_t mode)
{
	int err = -EAGAIN;

	if (f2fs_has_inline_dentry(dir)) {
		/*
		 * Should get i_xattr_sem to keep the lock order:
		 * i_xattr_sem -> inode_page lock used by f2fs_setxattr.
		 */
		f2fs_down_read(&F2FS_I(dir)->i_xattr_sem);
		err = f2fs_add_inline_entry(dir, fname, inode, ino, mode);
		f2fs_up_read(&F2FS_I(dir)->i_xattr_sem);
	}
	if (err == -EAGAIN)
		err = f2fs_add_regular_entry(dir, fname, inode, ino, mode);

	f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
	return err;
}

/*
 * Caller should grab and release a rwsem by calling f2fs_lock_op() and
 * f2fs_unlock_op().
*/ int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_filename fname; struct page *page = NULL; struct f2fs_dir_entry *de = NULL; int err; err = f2fs_setup_filename(dir, name, 0, &fname); if (err) return err; /* * An immature stackable filesystem shows a race condition between lookup * and create. If we have same task when doing lookup and create, it's * definitely fine as expected by VFS normally. Otherwise, let's just * verify on-disk dentry one more time, which guarantees filesystem * consistency more. */ if (current != F2FS_I(dir)->task) { de = __f2fs_find_entry(dir, &fname, &page); F2FS_I(dir)->task = NULL; } if (de) { f2fs_put_page(page, 0); err = -EEXIST; } else if (IS_ERR(page)) { err = PTR_ERR(page); } else { err = f2fs_add_dentry(dir, &fname, inode, ino, mode); } f2fs_free_filename(&fname); return err; } int f2fs_do_tmpfile(struct inode *inode, struct inode *dir, struct f2fs_filename *fname) { struct page *page; int err = 0; f2fs_down_write(&F2FS_I(inode)->i_sem); page = f2fs_init_inode_metadata(inode, dir, fname, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } f2fs_put_page(page, 1); clear_inode_flag(inode, FI_NEW_INODE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); fail: f2fs_up_write(&F2FS_I(inode)->i_sem); return err; } void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); f2fs_down_write(&F2FS_I(inode)->i_sem); if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); inode_set_ctime_current(inode); f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { f2fs_i_links_write(inode, false); f2fs_i_size_write(inode, 0); } f2fs_up_write(&F2FS_I(inode)->i_sem); if (inode->i_nlink == 0) f2fs_add_orphan_inode(inode); else f2fs_release_orphan_inode(sbi); } /* * It only removes the dentry from the dentry page, corresponding name * entry in name page does not need to be touched during deletion. 
*/
void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
					struct inode *dir, struct inode *inode)
{
	struct	f2fs_dentry_block *dentry_blk;
	unsigned int bit_pos;
	int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
	int i;

	f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);

	if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT)
		f2fs_add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO);

	/* Inline directories are handled entirely by the inline helper. */
	if (f2fs_has_inline_dentry(dir))
		return f2fs_delete_inline_entry(dentry, page, dir, inode);

	lock_page(page);
	f2fs_wait_on_page_writeback(page, DATA, true, true);

	dentry_blk = page_address(page);
	/* Slot index of @dentry within its block. */
	bit_pos = dentry - dentry_blk->dentry;
	for (i = 0; i < slots; i++)
		__clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);

	/* Let's check and deallocate this dentry page */
	bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
			NR_DENTRY_IN_BLOCK,
			0);
	set_page_dirty(page);

	/*
	 * No slot in use any more: punch the block out of the directory and,
	 * if that succeeds, undo the dirtying done just above.
	 */
	if (bit_pos == NR_DENTRY_IN_BLOCK &&
		!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
		f2fs_clear_page_cache_dirty_tag(page);
		clear_page_dirty_for_io(page);
		ClearPageUptodate(page);
		clear_page_private_all(page);

		inode_dec_dirty_pages(dir);
		f2fs_remove_dirty_inode(dir);
	}
	f2fs_put_page(page, 1);

	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
	f2fs_mark_inode_dirty_sync(dir, false);

	if (inode)
		f2fs_drop_nlink(dir, inode);
}

/*
 * Return true when @dir holds no entries other than the first two slots of
 * block 0 (presumably "." and ".." - they are skipped by starting the bitmap
 * scan of block 0 at bit 2).  Holes in the directory are skipped; any other
 * page lookup error makes the directory count as non-empty.
 */
bool f2fs_empty_dir(struct inode *dir)
{
	unsigned long bidx = 0;
	struct page *dentry_page;
	unsigned int bit_pos;
	struct f2fs_dentry_block *dentry_blk;
	unsigned long nblock = dir_blocks(dir);

	if (f2fs_has_inline_dentry(dir))
		return f2fs_empty_inline_dir(dir);

	while (bidx < nblock) {
		pgoff_t next_pgofs;

		dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs);
		if (IS_ERR(dentry_page)) {
			if (PTR_ERR(dentry_page) == -ENOENT) {
				bidx = next_pgofs;
				continue;
			} else {
				return false;
			}
		}

		dentry_blk = page_address(dentry_page);
		if (bidx == 0)
			bit_pos = 2;
		else
			bit_pos = 0;
		bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
						NR_DENTRY_IN_BLOCK,
						bit_pos);

		f2fs_put_page(dentry_page, 0);

		if (bit_pos < NR_DENTRY_IN_BLOCK)
			return false;

		bidx++;
	}
	return true;
}

/*
 * Emit the entries described by @d to @ctx, starting at slot
 * ctx->pos % d->max.  @start_pos is the position value of slot 0 of @d and is
 * used to advance ctx->pos past each emitted entry.  @fstr is the scratch
 * buffer for decoding encrypted names.
 *
 * Returns 0 when all slots were processed, 1 when dir_emit() refused an
 * entry, or a negative error (-EFSCORRUPTED for an entry whose name length
 * runs past the block or exceeds F2FS_NAME_LEN).
 */
int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
			unsigned int start_pos, struct fscrypt_str *fstr)
{
	unsigned char d_type = DT_UNKNOWN;
	unsigned int bit_pos;
	struct f2fs_dir_entry *de = NULL;
	struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
	struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode);
	struct blk_plug plug;
	bool readdir_ra = sbi->readdir_ra;
	bool found_valid_dirent = false;
	int err = 0;

	bit_pos = ((unsigned long)ctx->pos % d->max);

	if (readdir_ra)
		blk_start_plug(&plug);

	while (bit_pos < d->max) {
		bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos);
		if (bit_pos >= d->max)
			break;

		de = &d->dentry[bit_pos];
		if (de->name_len == 0) {
			/*
			 * Warn only when this slot is the first of the block
			 * or follows a valid entry emitted by this call.
			 */
			if (found_valid_dirent || !bit_pos) {
				f2fs_warn_ratelimited(sbi,
					"invalid namelen(0), ino:%u, run fsck to fix.",
					le32_to_cpu(de->ino));
				set_sbi_flag(sbi, SBI_NEED_FSCK);
			}
			bit_pos++;
			ctx->pos = start_pos + bit_pos;
			continue;
		}

		d_type = fs_ftype_to_dtype(de->file_type);

		de_name.name = d->filename[bit_pos];
		de_name.len = le16_to_cpu(de->name_len);

		/* check memory boundary before moving forward */
		bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
		if (unlikely(bit_pos > d->max ||
				le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) {
			f2fs_warn(sbi, "%s: corrupted namelen=%d, run fsck to fix.",
				  __func__, le16_to_cpu(de->name_len));
			set_sbi_flag(sbi, SBI_NEED_FSCK);
			err = -EFSCORRUPTED;
			f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT);
			goto out;
		}

		if (IS_ENCRYPTED(d->inode)) {
			/* The conversion may change fstr->len; restore it after. */
			int save_len = fstr->len;

			err = fscrypt_fname_disk_to_usr(d->inode,
						(u32)le32_to_cpu(de->hash_code),
						0, &de_name, fstr);
			if (err)
				goto out;

			de_name = *fstr;
			fstr->len = save_len;
		}

		if (!dir_emit(ctx, de_name.name, de_name.len,
					le32_to_cpu(de->ino), d_type)) {
			/* ctx->pos is not advanced, so this entry is retried. */
			err = 1;
			goto out;
		}

		if (readdir_ra)
			f2fs_ra_node_page(sbi, le32_to_cpu(de->ino));

		ctx->pos = start_pos + bit_pos;
		found_valid_dirent = true;
	}
out:
	if (readdir_ra)
		blk_finish_plug(&plug);
	return err;
}

/*
 * Directory iteration entry point (wired up as .iterate_shared below).
 * Walks the dentry blocks starting at the block that contains ctx->pos and
 * feeds each one to f2fs_fill_dentries().  Returns 0 or a negative error; the
 * positive "stop" result of f2fs_fill_dentries() is reported as 0.
 */
static int f2fs_readdir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	unsigned long npages = dir_blocks(inode);
	struct f2fs_dentry_block *dentry_blk = NULL;
	struct page *dentry_page = NULL;
	struct file_ra_state *ra = &file->f_ra;
	loff_t start_pos = ctx->pos;
	unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
	struct f2fs_dentry_ptr d;
	struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
	int err = 0;

	if (IS_ENCRYPTED(inode)) {
		err = fscrypt_prepare_readdir(inode);
		if (err)
			goto out;

		err = fscrypt_fname_alloc_buffer(F2FS_NAME_LEN, &fstr);
		if (err < 0)
			goto out;
	}

	if (f2fs_has_inline_dentry(inode)) {
		err = f2fs_read_inline_dir(file, ctx, &fstr);
		goto out_free;
	}

	/*
	 * The loop step resets ctx->pos to the start of block n; it runs after
	 * n++ and after the hole-skipping "continue", but not after "break".
	 */
	for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) {
		pgoff_t next_pgofs;

		/* allow readdir() to be interrupted */
		if (fatal_signal_pending(current)) {
			err = -ERESTARTSYS;
			goto out_free;
		}
		cond_resched();

		/* readahead for multi pages of dir */
		if (npages - n > 1 && !ra_has_index(ra, n))
			page_cache_sync_readahead(inode->i_mapping, ra, file, n,
				min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));

		dentry_page = f2fs_find_data_page(inode, n, &next_pgofs);
		if (IS_ERR(dentry_page)) {
			err = PTR_ERR(dentry_page);
			if (err == -ENOENT) {
				/* Hole in the directory: skip ahead. */
				err = 0;
				n = next_pgofs;
				continue;
			} else {
				goto out_free;
			}
		}

		dentry_blk = page_address(dentry_page);

		make_dentry_ptr_block(inode, &d, dentry_blk);

		err = f2fs_fill_dentries(ctx, &d,
				n * NR_DENTRY_IN_BLOCK, &fstr);
		if (err) {
			f2fs_put_page(dentry_page, 0);
			break;
		}

		f2fs_put_page(dentry_page, 0);

		n++;
	}
out_free:
	fscrypt_fname_free_buffer(&fstr);
out:
	trace_f2fs_readdir(inode, start_pos, ctx->pos, err);
	return err < 0 ? err : 0;
}

/* File operations installed for f2fs directories. */
const struct file_operations f2fs_dir_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= f2fs_readdir,
	.fsync		= f2fs_sync_file,
	.unlocked_ioctl	= f2fs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = f2fs_compat_ioctl,
#endif
};
|
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 |
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include "peerlookup.h"
#include "peer.h"
#include "noise.h"

/* Map a public key to its bucket in the pubkey hashtable. */
static struct hlist_head *pubkey_bucket(struct pubkey_hashtable *table,
					const u8 pubkey[NOISE_PUBLIC_KEY_LEN])
{
	/* siphash gives us a secure 64bit number based on a random key. Since
	 * the bits are uniformly distributed, we can then mask off to get the
	 * bits we need.
	 */
	const u64 hash = siphash(pubkey, NOISE_PUBLIC_KEY_LEN, &table->key);

	return &table->hashtable[hash & (HASH_SIZE(table->hashtable) - 1)];
}

/* Allocate a pubkey hashtable with a freshly generated random siphash key.
 * Returns NULL if the allocation fails.
 */
struct pubkey_hashtable *wg_pubkey_hashtable_alloc(void)
{
	struct pubkey_hashtable *table = kvmalloc(sizeof(*table), GFP_KERNEL);

	if (!table)
		return NULL;

	get_random_bytes(&table->key, sizeof(table->key));
	hash_init(table->hashtable);
	mutex_init(&table->lock);
	return table;
}

/* Insert @peer into the bucket selected by its remote static public key,
 * holding table->lock for the duration of the list update.
 */
void wg_pubkey_hashtable_add(struct pubkey_hashtable *table,
			     struct wg_peer *peer)
{
	mutex_lock(&table->lock);
	hlist_add_head_rcu(&peer->pubkey_hash,
			   pubkey_bucket(table, peer->handshake.remote_static));
	mutex_unlock(&table->lock);
}

/* Unlink @peer from the pubkey hashtable under table->lock. */
void wg_pubkey_hashtable_remove(struct pubkey_hashtable *table,
				struct wg_peer *peer)
{
	mutex_lock(&table->lock);
	hlist_del_init_rcu(&peer->pubkey_hash);
	mutex_unlock(&table->lock);
}

/* Returns a strong reference to a peer */
struct wg_peer *
wg_pubkey_hashtable_lookup(struct pubkey_hashtable *table,
			   const u8 pubkey[NOISE_PUBLIC_KEY_LEN])
{
	struct wg_peer *iter_peer, *peer = NULL;

	rcu_read_lock_bh();
	hlist_for_each_entry_rcu_bh(iter_peer, pubkey_bucket(table, pubkey),
				    pubkey_hash) {
		if (!memcmp(pubkey, iter_peer->handshake.remote_static,
			    NOISE_PUBLIC_KEY_LEN)) {
			peer = iter_peer;
			break;
		}
	}
	/* Called with NULL when nothing matched; the result is returned as is. */
	peer = wg_peer_get_maybe_zero(peer);
	rcu_read_unlock_bh();
	return peer;
}

/* Map a session index to its bucket in the index hashtable. */
static struct hlist_head *index_bucket(struct index_hashtable *table,
				       const __le32 index)
{
	/* Since the indices are random and thus all bits are uniformly
	 * distributed, we can find its bucket simply by masking.
	 */
	return &table->hashtable[(__force u32)index &
				 (HASH_SIZE(table->hashtable) - 1)];
}

/* Allocate an empty index hashtable. Returns NULL if the allocation fails. */
struct index_hashtable *wg_index_hashtable_alloc(void)
{
	struct index_hashtable *table = kvmalloc(sizeof(*table), GFP_KERNEL);

	if (!table)
		return NULL;

	hash_init(table->hashtable);
	spin_lock_init(&table->lock);
	return table;
}

/* At the moment, we limit ourselves to 2^20 total peers, which generally might
 * amount to 2^20*3 items in this hashtable. The algorithm below works by
 * picking a random number and testing it. We can see that these limits mean we
 * usually succeed pretty quickly:
 *
 * >>> def calculation(tries, size):
 * ...     return (size / 2**32)**(tries - 1) * (1 - (size / 2**32))
 * ...
 * >>> calculation(1, 2**20 * 3)
 * 0.999267578125
 * >>> calculation(2, 2**20 * 3)
 * 0.0007318854331970215
 * >>> calculation(3, 2**20 * 3)
 * 5.360489012673497e-07
 * >>> calculation(4, 2**20 * 3)
 * 3.9261394135792216e-10
 *
 * At the moment, we don't do any masking, so this algorithm isn't exactly
 * constant time in either the random guessing or in the hash list lookup. We
 * could require a minimum of 3 tries, which would successfully mask the
 * guessing. This would not, however, help with the growing hash lengths, which
 * is another thing to consider moving forward.
 *
 * Any previous hashing of @entry is removed first. Returns the index that
 * was assigned to @entry.
 */

__le32 wg_index_hashtable_insert(struct index_hashtable *table,
				 struct index_hashtable_entry *entry)
{
	struct index_hashtable_entry *existing_entry;

	spin_lock_bh(&table->lock);
	hlist_del_init_rcu(&entry->index_hash);
	spin_unlock_bh(&table->lock);

	rcu_read_lock_bh();

search_unused_slot:
	/* First we try to find an unused slot, randomly, while unlocked. */
	entry->index = (__force __le32)get_random_u32();
	hlist_for_each_entry_rcu_bh(existing_entry,
				    index_bucket(table, entry->index),
				    index_hash) {
		if (existing_entry->index == entry->index)
			/* If it's already in use, we continue searching. */
			goto search_unused_slot;
	}

	/* Once we've found an unused slot, we lock it, and then double-check
	 * that nobody else stole it from us.
	 */
	spin_lock_bh(&table->lock);
	hlist_for_each_entry_rcu_bh(existing_entry,
				    index_bucket(table, entry->index),
				    index_hash) {
		if (existing_entry->index == entry->index) {
			spin_unlock_bh(&table->lock);
			/* If it was stolen, we start over. */
			goto search_unused_slot;
		}
	}
	/* Otherwise, we know we have it exclusively (since we're locked),
	 * so we insert.
	 */
	hlist_add_head_rcu(&entry->index_hash,
			   index_bucket(table, entry->index));
	spin_unlock_bh(&table->lock);

	rcu_read_unlock_bh();

	return entry->index;
}

/* Replace @old with @new in the table, giving @new the index of @old.
 * Returns false, changing nothing, if @old is not currently hashed.
 */
bool wg_index_hashtable_replace(struct index_hashtable *table,
				struct index_hashtable_entry *old,
				struct index_hashtable_entry *new)
{
	bool ret;

	spin_lock_bh(&table->lock);
	ret = !hlist_unhashed(&old->index_hash);
	if (unlikely(!ret))
		goto out;

	new->index = old->index;
	hlist_replace_rcu(&old->index_hash, &new->index_hash);

	/* Calling init here NULLs out index_hash, and in fact after this
	 * function returns, it's theoretically possible for this to get
	 * reinserted elsewhere. That means the RCU lookup below might either
	 * terminate early or jump between buckets, in which case the packet
	 * simply gets dropped, which isn't terrible.
	 */
	INIT_HLIST_NODE(&old->index_hash);
out:
	spin_unlock_bh(&table->lock);
	return ret;
}

/* Unlink @entry from the index hashtable under table->lock. */
void wg_index_hashtable_remove(struct index_hashtable *table,
			       struct index_hashtable_entry *entry)
{
	spin_lock_bh(&table->lock);
	hlist_del_init_rcu(&entry->index_hash);
	spin_unlock_bh(&table->lock);
}

/* Returns a strong reference to an entry->peer.
 *
 * The entry whose index equals @index is accepted only if its type
 * intersects @type_mask. On success *@peer is set and the entry is
 * returned; otherwise NULL is returned and *@peer is left untouched.
 */
struct index_hashtable_entry *
wg_index_hashtable_lookup(struct index_hashtable *table,
			  const enum index_hashtable_type type_mask,
			  const __le32 index, struct wg_peer **peer)
{
	struct index_hashtable_entry *iter_entry, *entry = NULL;

	rcu_read_lock_bh();
	hlist_for_each_entry_rcu_bh(iter_entry, index_bucket(table, index),
				    index_hash) {
		if (iter_entry->index == index) {
			if (likely(iter_entry->type & type_mask))
				entry = iter_entry;
			/* Stop at the first index match even if the type differs. */
			break;
		}
	}
	if (likely(entry)) {
		entry->peer = wg_peer_get_maybe_zero(entry->peer);
		if (likely(entry->peer))
			*peer = entry->peer;
		else
			entry = NULL;
	}
	rcu_read_unlock_bh();
	return entry;
}
|
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2019 HUAWEI, Inc.
 *             https://www.huawei.com/
 */
#ifndef __EROFS_FS_COMPRESS_H
#define __EROFS_FS_COMPRESS_H

#include "internal.h"

/*
 * Parameters of a single decompression request, handed to a
 * z_erofs_decompressor's ->decompress() callback.
 */
struct z_erofs_decompress_req {
	struct super_block *sb;
	/* NOTE(review): presumably the compressed / decompressed page arrays */
	struct page **in, **out;
	unsigned short pageofs_in, pageofs_out;
	unsigned int inputsize, outputsize;

	unsigned int alg;       /* the algorithm for decompression */
	bool inplace_io, partial_decoding, fillgaps;
	gfp_t gfp;      /* allocation flags for extra temporary buffers */
};

/* Callback table describing one decompression algorithm. */
struct z_erofs_decompressor {
	int (*config)(struct super_block *sb, struct erofs_super_block *dsb,
		      void *data, int size);
	int (*decompress)(struct z_erofs_decompress_req *rq,
			  struct page **pagepool);
	int (*init)(void);
	void (*exit)(void);
	char *name;
};

/* some special page->private (unsigned long, see below) */
#define Z_EROFS_SHORTLIVED_PAGE		(-1UL << 2)
#define Z_EROFS_PREALLOCATED_PAGE	(-2UL << 2)

/*
 * For all pages in a pcluster, page->private should be one of
 * Type                         Last 2bits      page->private
 * short-lived page             00              Z_EROFS_SHORTLIVED_PAGE
 * preallocated page (tryalloc) 00              Z_EROFS_PREALLOCATED_PAGE
 * cached/managed page          00              pointer to z_erofs_pcluster
 * online page (file-backed,    01/10/11        sub-index << 2 | count
 *              some pages can be used for inplace I/O)
 *
 * page->mapping should be one of
 * Type                 page->mapping
 * short-lived page     NULL
 * preallocated page    NULL
 * cached/managed page  non-NULL or NULL (invalidated/truncated page)
 * online page          non-NULL
 *
 * For all managed pages, PG_private should be set with 1 extra refcount,
 * which is used for page reclaim / migration.
 */

/*
 * Currently, short-lived pages are pages directly from buddy system
 * with specific page->private (Z_EROFS_SHORTLIVED_PAGE).
 * In the future world of Memdescs, it should be type 0 (Misc) memory
 * which type can be checked with a new helper.
 */
static inline bool z_erofs_is_shortlived_page(struct page *page)
{
	return page->private == Z_EROFS_SHORTLIVED_PAGE;
}

/*
 * Add @page to @pagepool if it is a short-lived page.
 * Returns true if the page was added, false if it is not short-lived.
 */
static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
					      struct page *page)
{
	if (!z_erofs_is_shortlived_page(page))
		return false;
	erofs_pagepool_add(pagepool, page);
	return true;
}

extern const struct z_erofs_decompressor z_erofs_lzma_decomp;
extern const struct z_erofs_decompressor z_erofs_deflate_decomp;
extern const struct z_erofs_decompressor z_erofs_zstd_decomp;
extern const struct z_erofs_decompressor *z_erofs_decomp[];

/* Working state kept while decoding one request with a streaming decompressor. */
struct z_erofs_stream_dctx {
	struct z_erofs_decompress_req *rq;
	unsigned int inpages, outpages;	/* # of {en,de}coded pages */
	int no, ni;			/* the current {en,de}coded page # */

	unsigned int avail_out;	/* remaining bytes in the decoded buffer */
	unsigned int inbuf_pos, inbuf_sz;
				/* current status of the encoded buffer */
	u8 *kin, *kout;		/* buffer mapped pointers */
	void *bounce;		/* bounce buffer for inplace I/Os */
	bool bounced;		/* is the bounce buffer used now? */
};

int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
			       void **src, struct page **pgpl);
int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
			 unsigned int padbufsize);
int __init z_erofs_init_decompressor(void);
void z_erofs_exit_decompressor(void);
#endif
|
| 18 15 17 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 | /* * linux/fs/nls/mac-roman.c * * Charset macroman translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ /* * COPYRIGHT AND PERMISSION NOTICE * * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under * the Terms of Use in http://www.unicode.org/copyright.html. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of the Unicode data files and any associated documentation (the "Data * Files") or Unicode software and any associated documentation (the * "Software") to deal in the Data Files or Software without restriction, * including without limitation the rights to use, copy, modify, merge, * publish, distribute, and/or sell copies of the Data Files or Software, and * to permit persons to whom the Data Files or Software are furnished to do * so, provided that (a) the above copyright notice(s) and this permission * notice appear with all copies of the Data Files or Software, (b) both the * above copyright notice(s) and this permission notice appear in associated * documentation, and (c) there is clear notice in each modified Data File or * in the Software as well as in the documentation associated with the Data * File(s) or Software that the data or software has been modified. 
* * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR * PERFORMANCE OF THE DATA FILES OR SOFTWARE. * * Except as contained in this notice, the name of a copyright holder shall * not be used in advertising or otherwise to promote the sale, use or other * dealings in these Data Files or Software without prior written * authorization of the copyright holder. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 
0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80 */ 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1, 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8, /* 0x90 */ 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3, 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc, /* 0xa0 */ 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df, 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8, /* 0xb0 */ 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211, 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8, /* 0xc0 */ 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153, /* 0xd0 */ 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca, 0x00ff, 0x0178, 0x2044, 0x20ac, 0x2039, 0x203a, 0xfb01, 0xfb02, /* 0xe0 */ 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4, /* 0xf0 */ 0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc, 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 
0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xca, 0xc1, 0xa2, 0xa3, 0x00, 0xb4, 0x00, 0xa4, /* 0xa0-0xa7 */ 0xac, 0xa9, 0xbb, 0xc7, 0xc2, 0x00, 0xa8, 0xf8, /* 0xa8-0xaf */ 0xa1, 0xb1, 0x00, 0x00, 0xab, 0xb5, 0xa6, 0xe1, /* 0xb0-0xb7 */ 0xfc, 0x00, 0xbc, 0xc8, 0x00, 0x00, 0x00, 0xc0, /* 0xb8-0xbf */ 0xcb, 0xe7, 0xe5, 0xcc, 0x80, 0x81, 0xae, 0x82, /* 0xc0-0xc7 */ 0xe9, 0x83, 0xe6, 0xe8, 0xed, 0xea, 0xeb, 0xec, /* 0xc8-0xcf */ 0x00, 0x84, 0xf1, 0xee, 0xef, 0xcd, 0x85, 0x00, /* 0xd0-0xd7 */ 0xaf, 0xf4, 0xf2, 0xf3, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */ 0x88, 0x87, 0x89, 0x8b, 0x8a, 0x8c, 0xbe, 0x8d, /* 0xe0-0xe7 */ 0x8f, 0x8e, 0x90, 0x91, 0x93, 0x92, 0x94, 0x95, /* 0xe8-0xef */ 0x00, 0x96, 0x98, 0x97, 0x99, 0x9b, 0x9a, 0xd6, /* 0xf0-0xf7 */ 0xbf, 0x9d, 0x9c, 0x9e, 0x9f, 0x00, 0x00, 0xd8, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0xce, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0xd9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf6, 0xff, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0xf9, 0xfa, 0xfb, 0xfe, 0xf7, 0xfd, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0xbd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0xd0, 0xd1, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd4, 0xd5, 0xe2, 0x00, 0xd2, 0xd3, 0xe3, 0x00, /* 0x18-0x1f */ 0xa0, 0xe0, 
0xa5, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0xdc, 0xdd, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0xdb, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xc6, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0xb8, /* 0x08-0x0f */ 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xb0, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned 
char page25[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char pagef8[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, /* 0xf8-0xff */ }; static const unsigned char pagefb[256] = { 0x00, 0xde, 0xdf, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 
0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, page03, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, page21, page22, NULL, NULL, page25, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, pagef8, NULL, NULL, pagefb, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 
0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; 
static const unsigned char charset2upper[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "macroman", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_macroman(void) { return register_nls(&table); } static void __exit exit_nls_macroman(void) { unregister_nls(&table); } module_init(init_nls_macroman) module_exit(exit_nls_macroman) MODULE_DESCRIPTION("NLS Codepage macroman"); MODULE_LICENSE("Dual BSD/GPL"); |
| 1 1 1 1 1 1 62 33 33 33 2 61 63 64 60 19 60 15 62 62 62 62 12 9 12 61 21 62 1 33 28 17 33 54 19 39 20 20 33 12 33 33 2 32 33 33 20 18 1 18 18 55 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 |
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/readpage.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2015, Google, Inc.
 *
 * This was originally taken from fs/mpage.c
 *
 * The ext4_mpage_readpages() function here is intended to
 * replace mpage_readahead() in the general case, not just for
 * encrypted files.  It has some limitations (see below), where it
 * will fall back to block_read_full_folio(), but these limitations
 * should only be hit when page_size != block_size.
 *
 * This will allow us to attach a callback function to support ext4
 * encryption.
 *
 * If anything unusual happens, such as:
 *
 * - encountering a page which has buffers
 * - encountering a page which has a non-hole after a hole
 * - encountering a page with non-contiguous blocks
 *
 * then this code just gives up and calls the buffer_head-based read function.
 * It does handle a page which has holes at the end - that is a common case:
 * the end-of-file on blocksize < PAGE_SIZE setups.
 *
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/kdev_t.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>

#include "ext4.h"

/* Element count handed to mempool_create_slab_pool() for the ctx pool. */
#define NUM_PREALLOC_POST_READ_CTXS	128

/* Slab cache and mempool backing struct bio_post_read_ctx allocations. */
static struct kmem_cache *bio_post_read_ctx_cache;
static mempool_t *bio_post_read_ctx_pool;

/*
 * Postprocessing steps for read bios.  The enumerators double as bit
 * positions in bio_post_read_ctx::enabled_steps and as the values that
 * bio_post_read_ctx::cur_step walks through.
 */
enum bio_post_read_step {
	STEP_INITIAL = 0,
	STEP_DECRYPT,
	STEP_VERITY,
	STEP_MAX,
};

/* Per-bio state for post-read processing; stored in bio->bi_private. */
struct bio_post_read_ctx {
	struct bio *bio;		/* the bio this context is attached to */
	struct work_struct work;	/* work item for the step being run */
	unsigned int cur_step;		/* enum bio_post_read_step reached so far */
	unsigned int enabled_steps;	/* bitmask of (1 << STEP_*) to perform */
};

/*
 * Final completion of a read bio: end the read on every folio in the bio
 * (reported as successful only when bi_status is 0), return an attached
 * post-read context to the mempool, and drop the bio reference.
 */
static void __read_end_io(struct bio *bio)
{
	struct folio_iter fi;

	bio_for_each_folio_all(fi, bio)
		folio_end_read(fi.folio, bio->bi_status == 0);
	if (bio->bi_private)
		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
	bio_put(bio);
}

static void bio_post_read_processing(struct bio_post_read_ctx *ctx);

/*
 * Work handler for STEP_DECRYPT.  When fscrypt_decrypt_bio() returns true
 * the next post-read step is started; otherwise the bio is completed right
 * away.
 * NOTE(review): presumably a false return means decryption failed and the
 * error has been recorded in bio->bi_status - confirm against fscrypt.
 */
static void decrypt_work(struct work_struct *work)
{
	struct bio_post_read_ctx *ctx =
		container_of(work, struct bio_post_read_ctx, work);
	struct bio *bio = ctx->bio;

	if (fscrypt_decrypt_bio(bio))
		bio_post_read_processing(ctx);
	else
		__read_end_io(bio);
}

/*
 * Work handler for STEP_VERITY: release the context, verify the bio and
 * then complete it.
 */
static void verity_work(struct work_struct *work)
{
	struct bio_post_read_ctx *ctx =
		container_of(work, struct bio_post_read_ctx, work);
	struct bio *bio = ctx->bio;

	/*
	 * fsverity_verify_bio() may call readahead() again, and although verity
	 * will be disabled for that, decryption may still be needed, causing
	 * another bio_post_read_ctx to be allocated.  So to guarantee that
	 * mempool_alloc() never deadlocks we must free the current ctx first.
	 * This is safe because verity is the last post-read step.
	 */
	BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX);
	mempool_free(ctx, bio_post_read_ctx_pool);
	/* Cleared so that __read_end_io() does not free the ctx a second time. */
	bio->bi_private = NULL;

	fsverity_verify_bio(bio);

	__read_end_io(bio);
}

/*
 * Advance @ctx to its next enabled post-read step and queue the matching
 * work item.  Steps that are not enabled are skipped by falling through to
 * the next case; once no step is left the bio is completed.
 */
static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
{
	/*
	 * We use different work queues for decryption and for verity because
	 * verity may require reading metadata pages that need decryption, and
	 * we shouldn't recurse to the same workqueue.
	 */
	switch (++ctx->cur_step) {
	case STEP_DECRYPT:
		if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
			INIT_WORK(&ctx->work, decrypt_work);
			fscrypt_enqueue_decrypt_work(&ctx->work);
			return;
		}
		ctx->cur_step++;
		fallthrough;
	case STEP_VERITY:
		if (ctx->enabled_steps & (1 << STEP_VERITY)) {
			INIT_WORK(&ctx->work, verity_work);
			fsverity_enqueue_verify_work(&ctx->work);
			return;
		}
		ctx->cur_step++;
		fallthrough;
	default:
		__read_end_io(ctx->bio);
	}
}

/*
 * Post-read processing is needed only when a context is attached to the
 * bio and the bio carries no error status.
 */
static bool bio_post_read_required(struct bio *bio)
{
	return bio->bi_private && !bio->bi_status;
}

/*
 * I/O completion handler for multipage BIOs.
 *
 * The mpage code never puts partial pages into a BIO (except for end-of-file).
 * If a page does not map to a contiguous run of blocks then it simply falls
 * back to block_read_full_folio().
 *
 * Why is this?  If a page's completion depends on a number of different BIOs
 * which can complete in any order (or at the same time) then determining the
 * status of that page is hard.  See end_buffer_async_read() for the details.
 * There is no point in duplicating all that complexity.
 */
static void mpage_end_io(struct bio *bio)
{
	if (bio_post_read_required(bio)) {
		struct bio_post_read_ctx *ctx = bio->bi_private;

		/* Restart the step sequence; the first ++cur_step lands on STEP_DECRYPT. */
		ctx->cur_step = STEP_INITIAL;
		bio_post_read_processing(ctx);
		return;
	}
	__read_end_io(bio);
}

/*
 * True when fs-verity is active on @inode and page index @idx lies within
 * the pages needed to cover i_size.
 */
static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx)
{
	return fsverity_active(inode) &&
	       idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
}

/*
 * Decide which post-read steps @bio needs and, if there are any, allocate a
 * bio_post_read_ctx from the mempool and attach it through bio->bi_private.
 * @first_idx is the page index used for the verity decision.
 */
static void ext4_set_bio_post_read_ctx(struct bio *bio,
				       const struct inode *inode,
				       pgoff_t first_idx)
{
	unsigned int post_read_steps = 0;

	if (fscrypt_inode_uses_fs_layer_crypto(inode))
		post_read_steps |= 1 << STEP_DECRYPT;

	if (ext4_need_verity(inode, first_idx))
		post_read_steps |= 1 << STEP_VERITY;

	if (post_read_steps) {
		/* Due to the mempool, this never fails. */
		struct bio_post_read_ctx *ctx =
			mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);

		ctx->bio = bio;
		ctx->enabled_steps = post_read_steps;
		bio->bi_private = ctx;
	}
}

/*
 * Byte offset up to which blocks are looked up when reading: the
 * superblock's s_maxbytes for verity inodes (when CONFIG_FS_VERITY is
 * enabled), otherwise i_size.
 * NOTE(review): presumably verity inodes keep data beyond i_size that must
 * stay readable - confirm against the fs-verity documentation.
 */
static inline loff_t ext4_readpage_limit(struct inode *inode)
{
	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
		return inode->i_sb->s_maxbytes;

	return i_size_read(inode);
}

/*
 * Read a single folio, or every folio of a readahead request, by building
 * read bios out of runs of physically contiguous blocks.
 *
 * @inode: inode the folios belong to
 * @rac:   readahead control; when non-NULL, readahead_count(rac) folios are
 *         fetched from it with readahead_folio() and @folio is overwritten
 * @folio: the one folio to read when @rac is NULL
 *
 * For each folio the block mapping is collected in blocks[], first from
 * what is left of the previous ext4_map_blocks() result and then from new
 * ext4_map_blocks() calls.  A folio that already has buffers, has a mapped
 * block after a hole, or maps to non-contiguous blocks takes the "confused"
 * path and is read with block_read_full_folio().  A mapping error zeroes
 * and unlocks the folio.
 *
 * Always returns 0.
 */
int ext4_mpage_readpages(struct inode *inode,
		struct readahead_control *rac, struct folio *folio)
{
	struct bio *bio = NULL;
	sector_t last_block_in_bio = 0;

	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	sector_t next_block;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];
	unsigned page_block;
	struct block_device *bdev = inode->i_sb->s_bdev;
	int length;
	unsigned relative_block = 0;
	struct ext4_map_blocks map;
	unsigned int nr_pages = rac ? readahead_count(rac) : 1;

	/* Start with an empty mapping so the reuse test below fails at first. */
	map.m_pblk = 0;
	map.m_lblk = 0;
	map.m_len = 0;
	map.m_flags = 0;

	for (; nr_pages; nr_pages--) {
		int fully_mapped = 1;
		/* blocks_per_page here means "no hole seen in this folio". */
		unsigned first_hole = blocks_per_page;

		if (rac)
			folio = readahead_folio(rac);
		prefetchw(&folio->flags);

		if (folio_buffers(folio))
			goto confused;

		block_in_file = next_block =
			(sector_t)folio->index << (PAGE_SHIFT - blkbits);
		/*
		 * Mapping may extend over all folios still to be processed,
		 * clamped to the read limit rounded up to a whole block.
		 */
		last_block = block_in_file + nr_pages * blocks_per_page;
		last_block_in_file = (ext4_readpage_limit(inode) +
				      blocksize - 1) >> blkbits;
		if (last_block > last_block_in_file)
			last_block = last_block_in_file;
		page_block = 0;

		/*
		 * Map blocks using the previous result first.
		 */
		if ((map.m_flags & EXT4_MAP_MAPPED) &&
		    block_in_file > map.m_lblk &&
		    block_in_file < (map.m_lblk + map.m_len)) {
			unsigned map_offset = block_in_file - map.m_lblk;
			unsigned last = map.m_len - map_offset;

			for (relative_block = 0; ; relative_block++) {
				if (relative_block == last) {
					/* needed? */
					map.m_flags &= ~EXT4_MAP_MAPPED;
					break;
				}
				if (page_block == blocks_per_page)
					break;
				blocks[page_block] = map.m_pblk + map_offset +
					relative_block;
				page_block++;
				block_in_file++;
			}
		}

		/*
		 * Then do more ext4_map_blocks() calls until we are
		 * done with this folio.
		 */
		while (page_block < blocks_per_page) {
			if (block_in_file < last_block) {
				map.m_lblk = block_in_file;
				map.m_len = last_block - block_in_file;

				if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
				/*
				 * Also the target of the goto taken when verity
				 * verification of an all-hole folio fails.
				 */
				set_error_page:
					folio_zero_segment(folio, 0,
							  folio_size(folio));
					folio_unlock(folio);
					goto next_page;
				}
			}
			if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
				/* Unmapped block: remember where the hole starts. */
				fully_mapped = 0;
				if (first_hole == blocks_per_page)
					first_hole = page_block;
				page_block++;
				block_in_file++;
				continue;
			}
			if (first_hole != blocks_per_page)
				goto confused;		/* hole -> non-hole */

			/* Contiguous blocks? */
			if (page_block && blocks[page_block-1] != map.m_pblk-1)
				goto confused;
			for (relative_block = 0; ; relative_block++) {
				if (relative_block == map.m_len) {
					/* needed? */
					map.m_flags &= ~EXT4_MAP_MAPPED;
					break;
				} else if (page_block == blocks_per_page)
					break;
				blocks[page_block] = map.m_pblk+relative_block;
				page_block++;
				block_in_file++;
			}
		}
		if (first_hole != blocks_per_page) {
			/* Zero everything from the first hole to the folio end. */
			folio_zero_segment(folio, first_hole << blkbits,
					  folio_size(folio));
			if (first_hole == 0) {
				/* Entire folio is a hole: no I/O is issued. */
				if (ext4_need_verity(inode, folio->index) &&
				    !fsverity_verify_folio(folio))
					goto set_error_page;
				folio_end_read(folio, true);
				continue;
			}
		} else if (fully_mapped) {
			folio_set_mappedtodisk(folio);
		}

		/*
		 * This folio will go to BIO.  Do we need to send this
		 * BIO off first?
		 */
		if (bio && (last_block_in_bio != blocks[0] - 1 ||
			    !fscrypt_mergeable_bio(bio, inode, next_block))) {
		/* Also the retry target when bio_add_folio() fails below. */
		submit_and_realloc:
			submit_bio(bio);
			bio = NULL;
		}
		if (bio == NULL) {
			/*
			 * bio_alloc will _always_ be able to allocate a bio if
			 * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
			 */
			bio = bio_alloc(bdev, bio_max_segs(nr_pages),
					REQ_OP_READ, GFP_KERNEL);
			fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
						  GFP_KERNEL);
			ext4_set_bio_post_read_ctx(bio, inode, folio->index);
			/* Convert the block number to 512-byte (1 << 9) units. */
			bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
			bio->bi_end_io = mpage_end_io;
			if (rac)
				bio->bi_opf |= REQ_RAHEAD;
		}

		/* Only the part of the folio before the first hole is read. */
		length = first_hole << blkbits;
		if (!bio_add_folio(bio, folio, length, 0))
			goto submit_and_realloc;

		/*
		 * Submit right away when the mapping was used up at a boundary
		 * or the folio ends in a hole; otherwise remember the last
		 * block so the next folio can be checked for contiguity.
		 */
		if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
		     (relative_block == map.m_len)) ||
		    (first_hole != blocks_per_page)) {
			submit_bio(bio);
			bio = NULL;
		} else
			last_block_in_bio = blocks[blocks_per_page - 1];
		continue;
	/* Fallback: flush any pending bio, then use the buffer_head reader. */
	confused:
		if (bio) {
			submit_bio(bio);
			bio = NULL;
		}
		if (!folio_test_uptodate(folio))
			block_read_full_folio(folio, ext4_get_block);
		else
			folio_unlock(folio);
next_page:
		; /* A label shall be followed by a statement until C23 */
	}
	if (bio)
		submit_bio(bio);
	return 0;
}

/*
 * Create the slab cache and the mempool used for bio_post_read_ctx.
 * Returns 0 on success or -ENOMEM if either allocation fails; the cache is
 * destroyed again when only the mempool creation fails.
 */
int __init ext4_init_post_read_processing(void)
{
	bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT);

	if (!bio_post_read_ctx_cache)
		goto fail;
	bio_post_read_ctx_pool =
		mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS,
					 bio_post_read_ctx_cache);
	if (!bio_post_read_ctx_pool)
		goto fail_free_cache;
	return 0;

fail_free_cache:
	kmem_cache_destroy(bio_post_read_ctx_cache);
fail:
	return -ENOMEM;
}

/* Tear down the mempool first, then the slab cache it allocates from. */
void ext4_exit_post_read_processing(void)
{
	mempool_destroy(bio_post_read_ctx_pool);
	kmem_cache_destroy(bio_post_read_ctx_cache);
}
|
| 3 3 5 5 5 5 1 6 5 12 1 28 20 27 2 12 3 3 8 5 8 40 1 1 2 6 6 7 3 11 9 21 15 12 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 
506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 | /* * linux/fs/nls/nls_euc-jp.c * * Added `OSF/JVC Recommended Code Set Conversion Specification * between Japanese EUC and Shift-JIS' support: <hirofumi@mail.parknet.co.jp> * (http://www.opengroup.or.jp/jvc/cde/sjis-euc-e.html) */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static struct nls_table *p_nls; #define IS_SJIS_LOW_BYTE(l) ((0x40 <= (l)) && ((l) <= 0xFC) && ((l) != 0x7F)) /* JIS X 0208 (include NEC spesial characters) */ #define IS_SJIS_JISX0208(h, l) ((((0x81 <= (h)) && ((h) <= 0x9F)) \ || ((0xE0 <= (h)) && ((h) <= 0xEA))) \ && IS_SJIS_LOW_BYTE(l)) #define IS_SJIS_JISX0201KANA(c) ((0xA1 <= (c)) && ((c) <= 0xDF)) #define IS_SJIS_UDC_LOW(h, l) (((0xF0 <= (h)) && ((h) <= 0xF4)) \ && IS_SJIS_LOW_BYTE(l)) #define IS_SJIS_UDC_HI(h, l) (((0xF5 <= (h)) && ((h) <= 0xF9)) \ && IS_SJIS_LOW_BYTE(l)) #define IS_SJIS_IBM(h, l) (((0xFA <= (h)) && ((h) <= 0xFC)) \ && IS_SJIS_LOW_BYTE(l)) #define IS_SJIS_NECIBM(h, l) (((0xED <= (h)) && ((h) <= 0xEE)) \ && IS_SJIS_LOW_BYTE(l)) #define MAP_SJIS2EUC(sjis_hi, sjis_lo, sjis_p, euc_hi, euc_lo, euc_p) { \ if ((sjis_lo) >= 0x9F) { \ (euc_hi) = (sjis_hi) * 2 - (((sjis_p) * 2 - (euc_p)) - 1); \ (euc_lo) = (sjis_lo) + 2; \ } else { \ (euc_hi) = (sjis_hi) * 2 - ((sjis_p) * 2 - (euc_p)); \ (euc_lo) = (sjis_lo) + ((sjis_lo) >= 0x7F ? 
0x60 : 0x61); \ } \ } while(0) #define SS2 (0x8E) /* Single Shift 2 */ #define SS3 (0x8F) /* Single Shift 3 */ #define IS_EUC_BYTE(c) ((0xA1 <= (c)) && ((c) <= 0xFE)) #define IS_EUC_JISX0208(h, l) (IS_EUC_BYTE(h) && IS_EUC_BYTE(l)) #define IS_EUC_JISX0201KANA(h, l) (((h) == SS2) && (0xA1 <= (l) && (l) <= 0xDF)) #define IS_EUC_UDC_LOW(h, l) (((0xF5 <= (h)) && ((h) <= 0xFE)) \ && IS_EUC_BYTE(l)) #define IS_EUC_UDC_HI(h, l) IS_EUC_UDC_LOW(h, l) /* G3 block */ #define MAP_EUC2SJIS(euc_hi, euc_lo, euc_p, sjis_hi, sjis_lo, sjis_p) { \ if ((euc_hi) & 1) { \ (sjis_hi) = (euc_hi) / 2 + ((sjis_p) - (euc_p) / 2); \ (sjis_lo) = (euc_lo) - ((euc_lo) >= 0xE0 ? 0x60 : 0x61); \ } else { \ (sjis_hi) = (euc_hi) / 2 + (((sjis_p) - (euc_p) / 2) - 1); \ (sjis_lo) = (euc_lo) - 2; \ } \ } while(0) /* SJIS IBM extended characters to EUC map */ static const unsigned char sjisibm2euc_map[][2] = { {0xF3, 0xF3}, {0xF3, 0xF4}, {0xF3, 0xF5}, {0xF3, 0xF6}, {0xF3, 0xF7}, {0xF3, 0xF8}, {0xF3, 0xF9}, {0xF3, 0xFA}, {0xF3, 0xFB}, {0xF3, 0xFC}, {0xF3, 0xFD}, {0xF3, 0xFE}, {0xF4, 0xA1}, {0xF4, 0xA2}, {0xF4, 0xA3}, {0xF4, 0xA4}, {0xF4, 0xA5}, {0xF4, 0xA6}, {0xF4, 0xA7}, {0xF4, 0xA8}, {0xA2, 0xCC}, {0xA2, 0xC3}, {0xF4, 0xA9}, {0xF4, 0xAA}, {0xF4, 0xAB}, {0xF4, 0xAC}, {0xF4, 0xAD}, {0xA2, 0xE8}, {0xD4, 0xE3}, {0xDC, 0xDF}, {0xE4, 0xE9}, {0xE3, 0xF8}, {0xD9, 0xA1}, {0xB1, 0xBB}, {0xF4, 0xAE}, {0xC2, 0xAD}, {0xC3, 0xFC}, {0xE4, 0xD0}, {0xC2, 0xBF}, {0xBC, 0xF4}, {0xB0, 0xA9}, {0xB0, 0xC8}, {0xF4, 0xAF}, {0xB0, 0xD2}, {0xB0, 0xD4}, {0xB0, 0xE3}, {0xB0, 0xEE}, {0xB1, 0xA7}, {0xB1, 0xA3}, {0xB1, 0xAC}, {0xB1, 0xA9}, {0xB1, 0xBE}, {0xB1, 0xDF}, {0xB1, 0xD8}, {0xB1, 0xC8}, {0xB1, 0xD7}, {0xB1, 0xE3}, {0xB1, 0xF4}, {0xB1, 0xE1}, {0xB2, 0xA3}, {0xF4, 0xB0}, {0xB2, 0xBB}, {0xB2, 0xE6}, {0x00, 0x00}, {0xB2, 0xED}, {0xB2, 0xF5}, {0xB2, 0xFC}, {0xF4, 0xB1}, {0xB3, 0xB5}, {0xB3, 0xD8}, {0xB3, 0xDB}, {0xB3, 0xE5}, {0xB3, 0xEE}, {0xB3, 0xFB}, {0xF4, 0xB2}, {0xF4, 0xB3}, {0xB4, 0xC0}, {0xB4, 0xC7}, {0xB4, 0xD0}, {0xB4, 
0xDE}, {0xF4, 0xB4}, {0xB5, 0xAA}, {0xF4, 0xB5}, {0xB5, 0xAF}, {0xB5, 0xC4}, {0xB5, 0xE8}, {0xF4, 0xB6}, {0xB7, 0xC2}, {0xB7, 0xE4}, {0xB7, 0xE8}, {0xB7, 0xE7}, {0xF4, 0xB7}, {0xF4, 0xB8}, {0xF4, 0xB9}, {0xB8, 0xCE}, {0xB8, 0xE1}, {0xB8, 0xF5}, {0xB8, 0xF7}, {0xB8, 0xF8}, {0xB8, 0xFC}, {0xB9, 0xAF}, {0xB9, 0xB7}, {0xBA, 0xBE}, {0xBA, 0xDB}, {0xCD, 0xAA}, {0xBA, 0xE1}, {0xF4, 0xBA}, {0xBA, 0xEB}, {0xBB, 0xB3}, {0xBB, 0xB8}, {0xF4, 0xBB}, {0xBB, 0xCA}, {0xF4, 0xBC}, {0xF4, 0xBD}, {0xBB, 0xD0}, {0xBB, 0xDE}, {0xBB, 0xF4}, {0xBB, 0xF5}, {0xBB, 0xF9}, {0xBC, 0xE4}, {0xBC, 0xED}, {0xBC, 0xFE}, {0xF4, 0xBE}, {0xBD, 0xC2}, {0xBD, 0xE7}, {0xF4, 0xBF}, {0xBD, 0xF0}, {0xBE, 0xB0}, {0xBE, 0xAC}, {0xF4, 0xC0}, {0xBE, 0xB3}, {0xBE, 0xBD}, {0xBE, 0xCD}, {0xBE, 0xC9}, {0xBE, 0xE4}, {0xBF, 0xA8}, {0xBF, 0xC9}, {0xC0, 0xC4}, {0xC0, 0xE4}, {0xC0, 0xF4}, {0xC1, 0xA6}, {0xF4, 0xC1}, {0xC1, 0xF5}, {0xC1, 0xFC}, {0xF4, 0xC2}, {0xC1, 0xF8}, {0xC2, 0xAB}, {0xC2, 0xA1}, {0xC2, 0xA5}, {0xF4, 0xC3}, {0xC2, 0xB8}, {0xC2, 0xBA}, {0xF4, 0xC4}, {0xC2, 0xC4}, {0xC2, 0xD2}, {0xC2, 0xD7}, {0xC2, 0xDB}, {0xC2, 0xDE}, {0xC2, 0xED}, {0xC2, 0xF0}, {0xF4, 0xC5}, {0xC3, 0xA1}, {0xC3, 0xB5}, {0xC3, 0xC9}, {0xC3, 0xB9}, {0xF4, 0xC6}, {0xC3, 0xD8}, {0xC3, 0xFE}, {0xF4, 0xC7}, {0xC4, 0xCC}, {0xF4, 0xC8}, {0xC4, 0xD9}, {0xC4, 0xEA}, {0xC4, 0xFD}, {0xF4, 0xC9}, {0xC5, 0xA7}, {0xC5, 0xB5}, {0xC5, 0xB6}, {0xF4, 0xCA}, {0xC5, 0xD5}, {0xC6, 0xB8}, {0xC6, 0xD7}, {0xC6, 0xE0}, {0xC6, 0xEA}, {0xC6, 0xE3}, {0xC7, 0xA1}, {0xC7, 0xAB}, {0xC7, 0xC7}, {0xC7, 0xC3}, {0xC7, 0xCB}, {0xC7, 0xCF}, {0xC7, 0xD9}, {0xF4, 0xCB}, {0xF4, 0xCC}, {0xC7, 0xE6}, {0xC7, 0xEE}, {0xC7, 0xFC}, {0xC7, 0xEB}, {0xC7, 0xF0}, {0xC8, 0xB1}, {0xC8, 0xE5}, {0xC8, 0xF8}, {0xC9, 0xA6}, {0xC9, 0xAB}, {0xC9, 0xAD}, {0xF4, 0xCD}, {0xC9, 0xCA}, {0xC9, 0xD3}, {0xC9, 0xE9}, {0xC9, 0xE3}, {0xC9, 0xFC}, {0xC9, 0xF4}, {0xC9, 0xF5}, {0xF4, 0xCE}, {0xCA, 0xB3}, {0xCA, 0xBD}, {0xCA, 0xEF}, {0xCA, 0xF1}, {0xCB, 0xAE}, {0xF4, 0xCF}, {0xCB, 0xCA}, {0xCB, 0xE6}, 
{0xCB, 0xEA}, {0xCB, 0xF0}, {0xCB, 0xF4}, {0xCB, 0xEE}, {0xCC, 0xA5}, {0xCB, 0xF9}, {0xCC, 0xAB}, {0xCC, 0xAE}, {0xCC, 0xAD}, {0xCC, 0xB2}, {0xCC, 0xC2}, {0xCC, 0xD0}, {0xCC, 0xD9}, {0xF4, 0xD0}, {0xCD, 0xBB}, {0xF4, 0xD1}, {0xCE, 0xBB}, {0xF4, 0xD2}, {0xCE, 0xBA}, {0xCE, 0xC3}, {0xF4, 0xD3}, {0xCE, 0xF2}, {0xB3, 0xDD}, {0xCF, 0xD5}, {0xCF, 0xE2}, {0xCF, 0xE9}, {0xCF, 0xED}, {0xF4, 0xD4}, {0xF4, 0xD5}, {0xF4, 0xD6}, {0x00, 0x00}, {0xF4, 0xD7}, {0xD0, 0xE5}, {0xF4, 0xD8}, {0xD0, 0xE9}, {0xD1, 0xE8}, {0xF4, 0xD9}, {0xF4, 0xDA}, {0xD1, 0xEC}, {0xD2, 0xBB}, {0xF4, 0xDB}, {0xD3, 0xE1}, {0xD3, 0xE8}, {0xD4, 0xA7}, {0xF4, 0xDC}, {0xF4, 0xDD}, {0xD4, 0xD4}, {0xD4, 0xF2}, {0xD5, 0xAE}, {0xF4, 0xDE}, {0xD7, 0xDE}, {0xF4, 0xDF}, {0xD8, 0xA2}, {0xD8, 0xB7}, {0xD8, 0xC1}, {0xD8, 0xD1}, {0xD8, 0xF4}, {0xD9, 0xC6}, {0xD9, 0xC8}, {0xD9, 0xD1}, {0xF4, 0xE0}, {0xF4, 0xE1}, {0xF4, 0xE2}, {0xF4, 0xE3}, {0xF4, 0xE4}, {0xDC, 0xD3}, {0xDD, 0xC8}, {0xDD, 0xD4}, {0xDD, 0xEA}, {0xDD, 0xFA}, {0xDE, 0xA4}, {0xDE, 0xB0}, {0xF4, 0xE5}, {0xDE, 0xB5}, {0xDE, 0xCB}, {0xF4, 0xE6}, {0xDF, 0xB9}, {0xF4, 0xE7}, {0xDF, 0xC3}, {0xF4, 0xE8}, {0xF4, 0xE9}, {0xE0, 0xD9}, {0xF4, 0xEA}, {0xF4, 0xEB}, {0xE1, 0xE2}, {0xF4, 0xEC}, {0xF4, 0xED}, {0xF4, 0xEE}, {0xE2, 0xC7}, {0xE3, 0xA8}, {0xE3, 0xA6}, {0xE3, 0xA9}, {0xE3, 0xAF}, {0xE3, 0xB0}, {0xE3, 0xAA}, {0xE3, 0xAB}, {0xE3, 0xBC}, {0xE3, 0xC1}, {0xE3, 0xBF}, {0xE3, 0xD5}, {0xE3, 0xD8}, {0xE3, 0xD6}, {0xE3, 0xDF}, {0xE3, 0xE3}, {0xE3, 0xE1}, {0xE3, 0xD4}, {0xE3, 0xE9}, {0xE4, 0xA6}, {0xE3, 0xF1}, {0xE3, 0xF2}, {0xE4, 0xCB}, {0xE4, 0xC1}, {0xE4, 0xC3}, {0xE4, 0xBE}, {0xF4, 0xEF}, {0xE4, 0xC0}, {0xE4, 0xC7}, {0xE4, 0xBF}, {0xE4, 0xE0}, {0xE4, 0xDE}, {0xE4, 0xD1}, {0xF4, 0xF0}, {0xE4, 0xDC}, {0xE4, 0xD2}, {0xE4, 0xDB}, {0xE4, 0xD4}, {0xE4, 0xFA}, {0xE4, 0xEF}, {0xE5, 0xB3}, {0xE5, 0xBF}, {0xE5, 0xC9}, {0xE5, 0xD0}, {0xE5, 0xE2}, {0xE5, 0xEA}, {0xE5, 0xEB}, {0xF4, 0xF1}, {0xF4, 0xF2}, {0xF4, 0xF3}, {0xE6, 0xE8}, {0xE6, 0xEF}, {0xE7, 0xAC}, {0xF4, 0xF4}, {0xE7, 
0xAE}, {0xF4, 0xF5}, {0xE7, 0xB1}, {0xF4, 0xF6}, {0xE7, 0xB2}, {0xE8, 0xB1}, {0xE8, 0xB6}, {0xF4, 0xF7}, {0xF4, 0xF8}, {0xE8, 0xDD}, {0xF4, 0xF9}, {0xF4, 0xFA}, {0xE9, 0xD1}, {0xF4, 0xFB}, {0xE9, 0xED}, {0xEA, 0xCD}, {0xF4, 0xFC}, {0xEA, 0xDB}, {0xEA, 0xE6}, {0xEA, 0xEA}, {0xEB, 0xA5}, {0xEB, 0xFB}, {0xEB, 0xFA}, {0xF4, 0xFD}, {0xEC, 0xD6}, {0xF4, 0xFE}, }; #define IS_EUC_IBM2JISX0208(h, l) \ (((h) == 0xA2 && (l) == 0xCC) || ((h) == 0xA2 && (l) == 0xE8)) /* EUC to SJIS IBM extended characters map (G3 JIS X 0212 block) */ static struct { unsigned short euc; unsigned char sjis[2]; } euc2sjisibm_jisx0212_map[] = { {0xA2C3, {0xFA, 0x55}}, {0xB0A9, {0xFA, 0x68}}, {0xB0C8, {0xFA, 0x69}}, {0xB0D2, {0xFA, 0x6B}}, {0xB0D4, {0xFA, 0x6C}}, {0xB0E3, {0xFA, 0x6D}}, {0xB0EE, {0xFA, 0x6E}}, {0xB1A3, {0xFA, 0x70}}, {0xB1A7, {0xFA, 0x6F}}, {0xB1A9, {0xFA, 0x72}}, {0xB1AC, {0xFA, 0x71}}, {0xB1BB, {0xFA, 0x61}}, {0xB1BE, {0xFA, 0x73}}, {0xB1C8, {0xFA, 0x76}}, {0xB1D7, {0xFA, 0x77}}, {0xB1D8, {0xFA, 0x75}}, {0xB1DF, {0xFA, 0x74}}, {0xB1E1, {0xFA, 0x7A}}, {0xB1E3, {0xFA, 0x78}}, {0xB1F4, {0xFA, 0x79}}, {0xB2A3, {0xFA, 0x7B}}, {0xB2BB, {0xFA, 0x7D}}, {0xB2E6, {0xFA, 0x7E}}, {0xB2ED, {0xFA, 0x80}}, {0xB2F5, {0xFA, 0x81}}, {0xB2FC, {0xFA, 0x82}}, {0xB3B5, {0xFA, 0x84}}, {0xB3D8, {0xFA, 0x85}}, {0xB3DB, {0xFA, 0x86}}, {0xB3DD, {0xFB, 0x77}}, {0xB3E5, {0xFA, 0x87}}, {0xB3EE, {0xFA, 0x88}}, {0xB3FB, {0xFA, 0x89}}, {0xB4C0, {0xFA, 0x8C}}, {0xB4C7, {0xFA, 0x8D}}, {0xB4D0, {0xFA, 0x8E}}, {0xB4DE, {0xFA, 0x8F}}, {0xB5AA, {0xFA, 0x91}}, {0xB5AF, {0xFA, 0x93}}, {0xB5C4, {0xFA, 0x94}}, {0xB5E8, {0xFA, 0x95}}, {0xB7C2, {0xFA, 0x97}}, {0xB7E4, {0xFA, 0x98}}, {0xB7E7, {0xFA, 0x9A}}, {0xB7E8, {0xFA, 0x99}}, {0xB8CE, {0xFA, 0x9E}}, {0xB8E1, {0xFA, 0x9F}}, {0xB8F5, {0xFA, 0xA0}}, {0xB8F7, {0xFA, 0xA1}}, {0xB8F8, {0xFA, 0xA2}}, {0xB8FC, {0xFA, 0xA3}}, {0xB9AF, {0xFA, 0xA4}}, {0xB9B7, {0xFA, 0xA5}}, {0xBABE, {0xFA, 0xA6}}, {0xBADB, {0xFA, 0xA7}}, {0xBAE1, {0xFA, 0xA9}}, {0xBAEB, {0xFA, 0xAB}}, {0xBBB3, 
{0xFA, 0xAC}}, {0xBBB8, {0xFA, 0xAD}}, {0xBBCA, {0xFA, 0xAF}}, {0xBBD0, {0xFA, 0xB2}}, {0xBBDE, {0xFA, 0xB3}}, {0xBBF4, {0xFA, 0xB4}}, {0xBBF5, {0xFA, 0xB5}}, {0xBBF9, {0xFA, 0xB6}}, {0xBCE4, {0xFA, 0xB7}}, {0xBCED, {0xFA, 0xB8}}, {0xBCF4, {0xFA, 0x67}}, {0xBCFE, {0xFA, 0xB9}}, {0xBDC2, {0xFA, 0xBB}}, {0xBDE7, {0xFA, 0xBC}}, {0xBDF0, {0xFA, 0xBE}}, {0xBEAC, {0xFA, 0xC0}}, {0xBEB0, {0xFA, 0xBF}}, {0xBEB3, {0xFA, 0xC2}}, {0xBEBD, {0xFA, 0xC3}}, {0xBEC9, {0xFA, 0xC5}}, {0xBECD, {0xFA, 0xC4}}, {0xBEE4, {0xFA, 0xC6}}, {0xBFA8, {0xFA, 0xC7}}, {0xBFC9, {0xFA, 0xC8}}, {0xC0C4, {0xFA, 0xC9}}, {0xC0E4, {0xFA, 0xCA}}, {0xC0F4, {0xFA, 0xCB}}, {0xC1A6, {0xFA, 0xCC}}, {0xC1F5, {0xFA, 0xCE}}, {0xC1F8, {0xFA, 0xD1}}, {0xC1FC, {0xFA, 0xCF}}, {0xC2A1, {0xFA, 0xD3}}, {0xC2A5, {0xFA, 0xD4}}, {0xC2AB, {0xFA, 0xD2}}, {0xC2AD, {0xFA, 0x63}}, {0xC2B8, {0xFA, 0xD6}}, {0xC2BA, {0xFA, 0xD7}}, {0xC2BF, {0xFA, 0x66}}, {0xC2C4, {0xFA, 0xD9}}, {0xC2D2, {0xFA, 0xDA}}, {0xC2D7, {0xFA, 0xDB}}, {0xC2DB, {0xFA, 0xDC}}, {0xC2DE, {0xFA, 0xDD}}, {0xC2ED, {0xFA, 0xDE}}, {0xC2F0, {0xFA, 0xDF}}, {0xC3A1, {0xFA, 0xE1}}, {0xC3B5, {0xFA, 0xE2}}, {0xC3B9, {0xFA, 0xE4}}, {0xC3C9, {0xFA, 0xE3}}, {0xC3D8, {0xFA, 0xE6}}, {0xC3FC, {0xFA, 0x64}}, {0xC3FE, {0xFA, 0xE7}}, {0xC4CC, {0xFA, 0xE9}}, {0xC4D9, {0xFA, 0xEB}}, {0xC4EA, {0xFA, 0xEC}}, {0xC4FD, {0xFA, 0xED}}, {0xC5A7, {0xFA, 0xEF}}, {0xC5B5, {0xFA, 0xF0}}, {0xC5B6, {0xFA, 0xF1}}, {0xC5D5, {0xFA, 0xF3}}, {0xC6B8, {0xFA, 0xF4}}, {0xC6D7, {0xFA, 0xF5}}, {0xC6E0, {0xFA, 0xF6}}, {0xC6E3, {0xFA, 0xF8}}, {0xC6EA, {0xFA, 0xF7}}, {0xC7A1, {0xFA, 0xF9}}, {0xC7AB, {0xFA, 0xFA}}, {0xC7C3, {0xFA, 0xFC}}, {0xC7C7, {0xFA, 0xFB}}, {0xC7CB, {0xFB, 0x40}}, {0xC7CF, {0xFB, 0x41}}, {0xC7D9, {0xFB, 0x42}}, {0xC7E6, {0xFB, 0x45}}, {0xC7EB, {0xFB, 0x48}}, {0xC7EE, {0xFB, 0x46}}, {0xC7F0, {0xFB, 0x49}}, {0xC7FC, {0xFB, 0x47}}, {0xC8B1, {0xFB, 0x4A}}, {0xC8E5, {0xFB, 0x4B}}, {0xC8F8, {0xFB, 0x4C}}, {0xC9A6, {0xFB, 0x4D}}, {0xC9AB, {0xFB, 0x4E}}, {0xC9AD, {0xFB, 0x4F}}, {0xC9CA, {0xFB, 
0x51}}, {0xC9D3, {0xFB, 0x52}}, {0xC9E3, {0xFB, 0x54}}, {0xC9E9, {0xFB, 0x53}}, {0xC9F4, {0xFB, 0x56}}, {0xC9F5, {0xFB, 0x57}}, {0xC9FC, {0xFB, 0x55}}, {0xCAB3, {0xFB, 0x59}}, {0xCABD, {0xFB, 0x5A}}, {0xCAEF, {0xFB, 0x5B}}, {0xCAF1, {0xFB, 0x5C}}, {0xCBAE, {0xFB, 0x5D}}, {0xCBCA, {0xFB, 0x5F}}, {0xCBE6, {0xFB, 0x60}}, {0xCBEA, {0xFB, 0x61}}, {0xCBEE, {0xFB, 0x64}}, {0xCBF0, {0xFB, 0x62}}, {0xCBF4, {0xFB, 0x63}}, {0xCBF9, {0xFB, 0x66}}, {0xCCA5, {0xFB, 0x65}}, {0xCCAB, {0xFB, 0x67}}, {0xCCAD, {0xFB, 0x69}}, {0xCCAE, {0xFB, 0x68}}, {0xCCB2, {0xFB, 0x6A}}, {0xCCC2, {0xFB, 0x6B}}, {0xCCD0, {0xFB, 0x6C}}, {0xCCD9, {0xFB, 0x6D}}, {0xCDAA, {0xFA, 0xA8}}, {0xCDBB, {0xFB, 0x6F}}, {0xCEBA, {0xFB, 0x73}}, {0xCEBB, {0xFB, 0x71}}, {0xCEC3, {0xFB, 0x74}}, {0xCEF2, {0xFB, 0x76}}, {0xCFD5, {0xFB, 0x78}}, {0xCFE2, {0xFB, 0x79}}, {0xCFE9, {0xFB, 0x7A}}, {0xCFED, {0xFB, 0x7B}}, {0xD0E5, {0xFB, 0x81}}, {0xD0E9, {0xFB, 0x83}}, {0xD1E8, {0xFB, 0x84}}, {0xD1EC, {0xFB, 0x87}}, {0xD2BB, {0xFB, 0x88}}, {0xD3E1, {0xFB, 0x8A}}, {0xD3E8, {0xFB, 0x8B}}, {0xD4A7, {0xFB, 0x8C}}, {0xD4D4, {0xFB, 0x8F}}, {0xD4E3, {0xFA, 0x5C}}, {0xD4F2, {0xFB, 0x90}}, {0xD5AE, {0xFB, 0x91}}, {0xD7DE, {0xFB, 0x93}}, {0xD8A2, {0xFB, 0x95}}, {0xD8B7, {0xFB, 0x96}}, {0xD8C1, {0xFB, 0x97}}, {0xD8D1, {0xFB, 0x98}}, {0xD8F4, {0xFB, 0x99}}, {0xD9A1, {0xFA, 0x60}}, {0xD9C6, {0xFB, 0x9A}}, {0xD9C8, {0xFB, 0x9B}}, {0xD9D1, {0xFB, 0x9C}}, {0xDCD3, {0xFB, 0xA2}}, {0xDCDF, {0xFA, 0x5D}}, {0xDDC8, {0xFB, 0xA3}}, {0xDDD4, {0xFB, 0xA4}}, {0xDDEA, {0xFB, 0xA5}}, {0xDDFA, {0xFB, 0xA6}}, {0xDEA4, {0xFB, 0xA7}}, {0xDEB0, {0xFB, 0xA8}}, {0xDEB5, {0xFB, 0xAA}}, {0xDECB, {0xFB, 0xAB}}, {0xDFB9, {0xFB, 0xAD}}, {0xDFC3, {0xFB, 0xAF}}, {0xE0D9, {0xFB, 0xB2}}, {0xE1E2, {0xFB, 0xB5}}, {0xE2C7, {0xFB, 0xB9}}, {0xE3A6, {0xFB, 0xBB}}, {0xE3A8, {0xFB, 0xBA}}, {0xE3A9, {0xFB, 0xBC}}, {0xE3AA, {0xFB, 0xBF}}, {0xE3AB, {0xFB, 0xC0}}, {0xE3AF, {0xFB, 0xBD}}, {0xE3B0, {0xFB, 0xBE}}, {0xE3BC, {0xFB, 0xC1}}, {0xE3BF, {0xFB, 0xC3}}, {0xE3C1, {0xFB, 0xC2}}, 
{0xE3D4, {0xFB, 0xCA}}, {0xE3D5, {0xFB, 0xC4}}, {0xE3D6, {0xFB, 0xC6}}, {0xE3D8, {0xFB, 0xC5}}, {0xE3DF, {0xFB, 0xC7}}, {0xE3E1, {0xFB, 0xC9}}, {0xE3E3, {0xFB, 0xC8}}, {0xE3E9, {0xFB, 0xCB}}, {0xE3F1, {0xFB, 0xCD}}, {0xE3F2, {0xFB, 0xCE}}, {0xE3F8, {0xFA, 0x5F}}, {0xE4A6, {0xFB, 0xCC}}, {0xE4BE, {0xFB, 0xD2}}, {0xE4BF, {0xFB, 0xD6}}, {0xE4C0, {0xFB, 0xD4}}, {0xE4C1, {0xFB, 0xD0}}, {0xE4C3, {0xFB, 0xD1}}, {0xE4C7, {0xFB, 0xD5}}, {0xE4CB, {0xFB, 0xCF}}, {0xE4D0, {0xFA, 0x65}}, {0xE4D1, {0xFB, 0xD9}}, {0xE4D2, {0xFB, 0xDC}}, {0xE4D4, {0xFB, 0xDE}}, {0xE4DB, {0xFB, 0xDD}}, {0xE4DC, {0xFB, 0xDB}}, {0xE4DE, {0xFB, 0xD8}}, {0xE4E0, {0xFB, 0xD7}}, {0xE4E9, {0xFA, 0x5E}}, {0xE4EF, {0xFB, 0xE0}}, {0xE4FA, {0xFB, 0xDF}}, {0xE5B3, {0xFB, 0xE1}}, {0xE5BF, {0xFB, 0xE2}}, {0xE5C9, {0xFB, 0xE3}}, {0xE5D0, {0xFB, 0xE4}}, {0xE5E2, {0xFB, 0xE5}}, {0xE5EA, {0xFB, 0xE6}}, {0xE5EB, {0xFB, 0xE7}}, {0xE6E8, {0xFB, 0xEB}}, {0xE6EF, {0xFB, 0xEC}}, {0xE7AC, {0xFB, 0xED}}, {0xE7AE, {0xFB, 0xEF}}, {0xE7B1, {0xFB, 0xF1}}, {0xE7B2, {0xFB, 0xF3}}, {0xE8B1, {0xFB, 0xF4}}, {0xE8B6, {0xFB, 0xF5}}, {0xE8DD, {0xFB, 0xF8}}, {0xE9D1, {0xFB, 0xFB}}, {0xE9ED, {0xFC, 0x40}}, {0xEACD, {0xFC, 0x41}}, {0xEADB, {0xFC, 0x43}}, {0xEAE6, {0xFC, 0x44}}, {0xEAEA, {0xFC, 0x45}}, {0xEBA5, {0xFC, 0x46}}, {0xEBFA, {0xFC, 0x48}}, {0xEBFB, {0xFC, 0x47}}, {0xECD6, {0xFC, 0x4A}}, }; /* EUC to SJIS IBM extended characters map (G3 Upper block) */ static const unsigned char euc2sjisibm_g3upper_map[][2] = { {0xFA, 0x40}, {0xFA, 0x41}, {0xFA, 0x42}, {0xFA, 0x43}, {0xFA, 0x44}, {0xFA, 0x45}, {0xFA, 0x46}, {0xFA, 0x47}, {0xFA, 0x48}, {0xFA, 0x49}, {0xFA, 0x4A}, {0xFA, 0x4B}, {0xFA, 0x4C}, {0xFA, 0x4D}, {0xFA, 0x4E}, {0xFA, 0x4F}, {0xFA, 0x50}, {0xFA, 0x51}, {0xFA, 0x52}, {0xFA, 0x53}, {0xFA, 0x56}, {0xFA, 0x57}, {0xFA, 0x58}, {0xFA, 0x59}, {0xFA, 0x5A}, {0xFA, 0x62}, {0xFA, 0x6A}, {0xFA, 0x7C}, {0xFA, 0x83}, {0xFA, 0x8A}, {0xFA, 0x8B}, {0xFA, 0x90}, {0xFA, 0x92}, {0xFA, 0x96}, {0xFA, 0x9B}, {0xFA, 0x9C}, {0xFA, 0x9D}, {0xFA, 
0xAA}, {0xFA, 0xAE}, {0xFA, 0xB0}, {0xFA, 0xB1}, {0xFA, 0xBA}, {0xFA, 0xBD}, {0xFA, 0xC1}, {0xFA, 0xCD}, {0xFA, 0xD0}, {0xFA, 0xD5}, {0xFA, 0xD8}, {0xFA, 0xE0}, {0xFA, 0xE5}, {0xFA, 0xE8}, {0xFA, 0xEA}, {0xFA, 0xEE}, {0xFA, 0xF2}, {0xFB, 0x43}, {0xFB, 0x44}, {0xFB, 0x50}, {0xFB, 0x58}, {0xFB, 0x5E}, {0xFB, 0x6E}, {0xFB, 0x70}, {0xFB, 0x72}, {0xFB, 0x75}, {0xFB, 0x7C}, {0xFB, 0x7D}, {0xFB, 0x7E}, {0xFB, 0x80}, {0xFB, 0x82}, {0xFB, 0x85}, {0xFB, 0x86}, {0xFB, 0x89}, {0xFB, 0x8D}, {0xFB, 0x8E}, {0xFB, 0x92}, {0xFB, 0x94}, {0xFB, 0x9D}, {0xFB, 0x9E}, {0xFB, 0x9F}, {0xFB, 0xA0}, {0xFB, 0xA1}, {0xFB, 0xA9}, {0xFB, 0xAC}, {0xFB, 0xAE}, {0xFB, 0xB0}, {0xFB, 0xB1}, {0xFB, 0xB3}, {0xFB, 0xB4}, {0xFB, 0xB6}, {0xFB, 0xB7}, {0xFB, 0xB8}, {0xFB, 0xD3}, {0xFB, 0xDA}, {0xFB, 0xE8}, {0xFB, 0xE9}, {0xFB, 0xEA}, {0xFB, 0xEE}, {0xFB, 0xF0}, {0xFB, 0xF2}, {0xFB, 0xF6}, {0xFB, 0xF7}, {0xFB, 0xF9}, {0xFB, 0xFA}, {0xFB, 0xFC}, {0xFC, 0x42}, {0xFC, 0x49}, {0xFC, 0x4B}, }; static inline int sjisibm2euc(unsigned char *euc, const unsigned char sjis_hi, const unsigned char sjis_lo); static inline int euc2sjisibm_jisx0212(unsigned char *sjis, const unsigned char euc_hi, const unsigned char euc_lo); static inline int euc2sjisibm_g3upper(unsigned char *sjis, const unsigned char euc_hi, const unsigned char euc_lo); static inline int euc2sjisibm(unsigned char *sjis, const unsigned char euc_hi, const unsigned char euc_lo); static inline int sjisnec2sjisibm(unsigned char *sjisibm, const unsigned char sjisnec_hi, const unsigned char sjisnec_lo); /* SJIS IBM extended characters to EUC */ static inline int sjisibm2euc(unsigned char *euc, const unsigned char sjis_hi, const unsigned char sjis_lo) { int index; index = ((sjis_hi - 0xFA) * (0xFD - 0x40)) + (sjis_lo - 0x40); if (IS_EUC_IBM2JISX0208(sjisibm2euc_map[index][0], sjisibm2euc_map[index][1])) { euc[0] = sjisibm2euc_map[index][0]; euc[1] = sjisibm2euc_map[index][1]; return 2; } else { euc[0] = SS3; euc[1] = sjisibm2euc_map[index][0]; euc[2] = 
sjisibm2euc_map[index][1]; return 3; } } /* EUC to SJIS IBM extended characters (G3 JIS X 0212 block) */ static inline int euc2sjisibm_jisx0212(unsigned char *sjis, const unsigned char euc_hi, const unsigned char euc_lo) { int index, min_index, max_index; unsigned short euc; min_index = 0; max_index = ARRAY_SIZE(euc2sjisibm_jisx0212_map) - 1; euc = (euc_hi << 8) | euc_lo; while (min_index <= max_index) { index = (min_index + max_index) / 2; if (euc < euc2sjisibm_jisx0212_map[index].euc) max_index = index - 1; else min_index = index + 1; if (euc == euc2sjisibm_jisx0212_map[index].euc) { sjis[0] = euc2sjisibm_jisx0212_map[index].sjis[0]; sjis[1] = euc2sjisibm_jisx0212_map[index].sjis[1]; return 3; } } return 0; } /* EUC to SJIS IBM extended characters (G3 Upper block) */ static inline int euc2sjisibm_g3upper(unsigned char *sjis, const unsigned char euc_hi, const unsigned char euc_lo) { int index; if (euc_hi == 0xF3) index = ((euc_hi << 8) | euc_lo) - 0xF3F3; else index = ((euc_hi << 8) | euc_lo) - 0xF4A1 + 12; if ((index < 0) || (index >= ARRAY_SIZE(euc2sjisibm_g3upper_map))) return 0; sjis[0] = euc2sjisibm_g3upper_map[index][0]; sjis[1] = euc2sjisibm_g3upper_map[index][1]; return 3; } /* EUC to SJIS IBM extended characters (G3 block) */ static inline int euc2sjisibm(unsigned char *sjis, const unsigned char euc_hi, const unsigned char euc_lo) { int n; #if 0 if ((euc_hi == 0xA2) && (euc_lo == 0xCC)) { sjis[0] = 0xFA; sjis[1] = 0x54; return 2; } else if ((euc_hi == 0xA2) && (euc_lo == 0xE8)) { sjis[0] = 0xFA; sjis[1] = 0x5B; return 2; } #endif if ((n = euc2sjisibm_g3upper(sjis, euc_hi, euc_lo))) { return n; } else if ((n = euc2sjisibm_jisx0212(sjis, euc_hi, euc_lo))) { return n; } return 0; } /* NEC/IBM extended characters to IBM extended characters */ static inline int sjisnec2sjisibm(unsigned char *sjisibm, const unsigned char sjisnec_hi, const unsigned char sjisnec_lo) { int count; if (! 
IS_SJIS_NECIBM(sjisnec_hi, sjisnec_lo)) return 0; if ((sjisnec_hi == 0xEE) && (sjisnec_lo == 0xF9)) { sjisibm[0] = 0x81; sjisibm[1] = 0xCA; return 2; } if ((sjisnec_hi == 0xEE) && (sjisnec_lo >= 0xEF)) { count = (sjisnec_hi << 8 | sjisnec_lo) - (sjisnec_lo <= 0xF9 ? 0xEEEF : (0xEEEF - 10)); } else { count = (sjisnec_hi - 0xED) * (0xFC - 0x40) + (sjisnec_lo - 0x40) + (0x5C - 0x40); if (sjisnec_lo >= 0x7F) count--; } sjisibm[0] = 0xFA + (count / (0xFC - 0x40)); sjisibm[1] = 0x40 + (count % (0xFC - 0x40)); if (sjisibm[1] >= 0x7F) sjisibm[1]++; return 2; } static int uni2char(const wchar_t uni, unsigned char *out, int boundlen) { int n; if (!p_nls) return -EINVAL; if ((n = p_nls->uni2char(uni, out, boundlen)) < 0) return n; /* translate SJIS into EUC-JP */ if (n == 1) { if (IS_SJIS_JISX0201KANA(out[0])) { /* JIS X 0201 KANA */ if (boundlen < 2) return -ENAMETOOLONG; out[1] = out[0]; out[0] = SS2; return 2; } } else if (n == 2) { /* NEC/IBM extended characters to IBM extended characters */ sjisnec2sjisibm(out, out[0], out[1]); if (IS_SJIS_UDC_LOW(out[0], out[1])) { /* User defined characters half low */ MAP_SJIS2EUC(out[0], out[1], 0xF0, out[0], out[1], 0xF5); } else if (IS_SJIS_UDC_HI(out[0], out[1])) { /* User defined characters half high */ unsigned char ch, cl; if (boundlen < 3) return -ENAMETOOLONG; n = 3; ch = out[0]; cl = out[1]; out[0] = SS3; MAP_SJIS2EUC(ch, cl, 0xF5, out[1], out[2], 0xF5); } else if (IS_SJIS_IBM(out[0], out[1])) { /* IBM extended characters */ unsigned char euc[3], i; n = sjisibm2euc(euc, out[0], out[1]); if (boundlen < n) return -ENAMETOOLONG; for (i = 0; i < n; i++) out[i] = euc[i]; } else if (IS_SJIS_JISX0208(out[0], out[1])) { /* JIS X 0208 (include NEC special characters) */ out[0] = (out[0]^0xA0)*2 + 0x5F; if (out[1] > 0x9E) out[0]++; if (out[1] < 0x7F) out[1] = out[1] + 0x61; else if (out[1] < 0x9F) out[1] = out[1] + 0x60; else out[1] = out[1] + 0x02; } else { /* Invalid characters */ return -EINVAL; } } else return -EINVAL; return n; } 
static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { unsigned char sjis_temp[2]; int euc_offset, n; if ( !p_nls ) return -EINVAL; if (boundlen <= 0) return -ENAMETOOLONG; /* translate EUC-JP into SJIS */ if (rawstring[0] > 0x7F) { if (rawstring[0] == SS3) { if (boundlen < 3) return -EINVAL; euc_offset = 3; if (IS_EUC_UDC_HI(rawstring[1], rawstring[2])) { /* User defined characters half high */ MAP_EUC2SJIS(rawstring[1], rawstring[2], 0xF5, sjis_temp[0], sjis_temp[1], 0xF5); } else if (euc2sjisibm(sjis_temp,rawstring[1],rawstring[2])) { /* IBM extended characters */ } else { /* JIS X 0212 and Invalid characters*/ return -EINVAL; /* 'GETA' with SJIS coding */ /* sjis_temp[0] = 0x81; */ /* sjis_temp[1] = 0xAC; */ } } else { if (boundlen < 2) return -EINVAL; euc_offset = 2; if (IS_EUC_JISX0201KANA(rawstring[0], rawstring[1])) { /* JIS X 0201 KANA */ sjis_temp[0] = rawstring[1]; sjis_temp[1] = 0x00; } else if (IS_EUC_UDC_LOW(rawstring[0], rawstring[1])) { /* User defined characters half low */ MAP_EUC2SJIS(rawstring[0], rawstring[1], 0xF5, sjis_temp[0], sjis_temp[1], 0xF0); } else if (IS_EUC_JISX0208(rawstring[0], rawstring[1])) { /* JIS X 0208 (include NEC spesial characters) */ sjis_temp[0] = ((rawstring[0]-0x5f)/2) ^ 0xA0; if (!(rawstring[0] & 1)) sjis_temp[1] = rawstring[1] - 0x02; else if (rawstring[1] < 0xE0) sjis_temp[1] = rawstring[1] - 0x61; else sjis_temp[1] = rawstring[1] - 0x60; } else { /* Invalid characters */ return -EINVAL; } } } else { euc_offset = 1; /* JIS X 0201 ROMAJI */ sjis_temp[0] = rawstring[0]; sjis_temp[1] = 0x00; } if ( (n = p_nls->char2uni(sjis_temp, sizeof(sjis_temp), uni)) < 0) return n; return euc_offset; } static struct nls_table table = { .charset = "euc-jp", .uni2char = uni2char, .char2uni = char2uni, }; static int __init init_nls_euc_jp(void) { p_nls = load_nls("cp932"); if (p_nls) { table.charset2upper = p_nls->charset2upper; table.charset2lower = p_nls->charset2lower; return register_nls(&table); } 
return -EINVAL; } static void __exit exit_nls_euc_jp(void) { unregister_nls(&table); unload_nls(p_nls); } module_init(init_nls_euc_jp) module_exit(exit_nls_euc_jp) MODULE_DESCRIPTION("NLS Japanese charset (EUC-JP)"); MODULE_LICENSE("Dual BSD/GPL"); |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | #undef TRACE_SYSTEM #define TRACE_SYSTEM qdisc #if !defined(_TRACE_QDISC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_QDISC_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> #include <linux/pkt_sched.h> #include <net/sch_generic.h> TRACE_EVENT(qdisc_dequeue, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, int packets, struct sk_buff *skb), TP_ARGS(qdisc, txq, packets, skb), TP_STRUCT__entry( __field( struct Qdisc *, qdisc ) __field(const struct netdev_queue *, txq ) __field( int, packets ) __field( void *, skbaddr ) __field( int, ifindex ) __field( u32, handle ) __field( u32, parent ) __field( unsigned long, txq_state) ), /* skb==NULL indicate packets dequeued was 0, even when packets==1 */ TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->packets = skb ? packets : 0; __entry->skbaddr = skb; __entry->ifindex = txq->dev ? 
txq->dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; __entry->txq_state = txq->state; ), TP_printk("dequeue ifindex=%d qdisc handle=0x%X parent=0x%X txq_state=0x%lX packets=%d skbaddr=%p", __entry->ifindex, __entry->handle, __entry->parent, __entry->txq_state, __entry->packets, __entry->skbaddr ) ); TRACE_EVENT(qdisc_enqueue, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb), TP_ARGS(qdisc, txq, skb), TP_STRUCT__entry( __field(struct Qdisc *, qdisc) __field(const struct netdev_queue *, txq) __field(void *, skbaddr) __field(int, ifindex) __field(u32, handle) __field(u32, parent) ), TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->skbaddr = skb; __entry->ifindex = txq->dev ? txq->dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; ), TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%p", __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr) ); TRACE_EVENT(qdisc_reset, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q) ? 
qdisc_dev(q)->name : "(null)" ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev); __assign_str(kind); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_destroy, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q)->name ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev); __assign_str(kind); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_create, TP_PROTO(const struct Qdisc_ops *ops, struct net_device *dev, u32 parent), TP_ARGS(ops, dev, parent), TP_STRUCT__entry( __string( dev, dev->name ) __string( kind, ops->id ) __field( u32, parent ) ), TP_fast_assign( __assign_str(dev); __assign_str(kind); __entry->parent = parent; ), TP_printk("dev=%s kind=%s parent=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent)) ); #endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 | /* * videobuf2-vmalloc.c - vmalloc memory allocator for videobuf2 * * Copyright (C) 2010 Samsung Electronics * * Author: Pawel Osciak <pawel@osciak.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free 
Software Foundation. */ #include <linux/io.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/refcount.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <media/videobuf2-v4l2.h> #include <media/videobuf2-vmalloc.h> #include <media/videobuf2-memops.h> struct vb2_vmalloc_buf { void *vaddr; struct frame_vector *vec; enum dma_data_direction dma_dir; unsigned long size; refcount_t refcount; struct vb2_vmarea_handler handler; struct dma_buf *dbuf; }; static void vb2_vmalloc_put(void *buf_priv); static void *vb2_vmalloc_alloc(struct vb2_buffer *vb, struct device *dev, unsigned long size) { struct vb2_vmalloc_buf *buf; buf = kzalloc(sizeof(*buf), GFP_KERNEL | vb->vb2_queue->gfp_flags); if (!buf) return ERR_PTR(-ENOMEM); buf->size = size; buf->vaddr = vmalloc_user(buf->size); if (!buf->vaddr) { pr_debug("vmalloc of size %ld failed\n", buf->size); kfree(buf); return ERR_PTR(-ENOMEM); } buf->dma_dir = vb->vb2_queue->dma_dir; buf->handler.refcount = &buf->refcount; buf->handler.put = vb2_vmalloc_put; buf->handler.arg = buf; refcount_set(&buf->refcount, 1); return buf; } static void vb2_vmalloc_put(void *buf_priv) { struct vb2_vmalloc_buf *buf = buf_priv; if (refcount_dec_and_test(&buf->refcount)) { vfree(buf->vaddr); kfree(buf); } } static void *vb2_vmalloc_get_userptr(struct vb2_buffer *vb, struct device *dev, unsigned long vaddr, unsigned long size) { struct vb2_vmalloc_buf *buf; struct frame_vector *vec; int n_pages, offset, i; int ret = -ENOMEM; buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) return ERR_PTR(-ENOMEM); buf->dma_dir = vb->vb2_queue->dma_dir; offset = vaddr & ~PAGE_MASK; buf->size = size; vec = vb2_create_framevec(vaddr, size, buf->dma_dir == DMA_FROM_DEVICE || buf->dma_dir == DMA_BIDIRECTIONAL); if (IS_ERR(vec)) { ret = PTR_ERR(vec); goto fail_pfnvec_create; } buf->vec = vec; n_pages = frame_vector_count(vec); if (frame_vector_to_pages(vec) < 0) { unsigned long *nums = frame_vector_pfns(vec); /* * We 
cannot get page pointers for these pfns. Check memory is * physically contiguous and use direct mapping. */ for (i = 1; i < n_pages; i++) if (nums[i-1] + 1 != nums[i]) goto fail_map; buf->vaddr = (__force void *) ioremap(__pfn_to_phys(nums[0]), size + offset); } else { buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1); } if (!buf->vaddr) goto fail_map; buf->vaddr += offset; return buf; fail_map: vb2_destroy_framevec(vec); fail_pfnvec_create: kfree(buf); return ERR_PTR(ret); } static void vb2_vmalloc_put_userptr(void *buf_priv) { struct vb2_vmalloc_buf *buf = buf_priv; unsigned long vaddr = (unsigned long)buf->vaddr & PAGE_MASK; unsigned int i; struct page **pages; unsigned int n_pages; if (!buf->vec->is_pfns) { n_pages = frame_vector_count(buf->vec); if (vaddr) vm_unmap_ram((void *)vaddr, n_pages); if (buf->dma_dir == DMA_FROM_DEVICE || buf->dma_dir == DMA_BIDIRECTIONAL) { pages = frame_vector_pages(buf->vec); if (!WARN_ON_ONCE(IS_ERR(pages))) for (i = 0; i < n_pages; i++) set_page_dirty_lock(pages[i]); } } else { iounmap((__force void __iomem *)buf->vaddr); } vb2_destroy_framevec(buf->vec); kfree(buf); } static void *vb2_vmalloc_vaddr(struct vb2_buffer *vb, void *buf_priv) { struct vb2_vmalloc_buf *buf = buf_priv; if (!buf->vaddr) { pr_err("Address of an unallocated plane requested or cannot map user pointer\n"); return NULL; } return buf->vaddr; } static unsigned int vb2_vmalloc_num_users(void *buf_priv) { struct vb2_vmalloc_buf *buf = buf_priv; return refcount_read(&buf->refcount); } static int vb2_vmalloc_mmap(void *buf_priv, struct vm_area_struct *vma) { struct vb2_vmalloc_buf *buf = buf_priv; int ret; if (!buf) { pr_err("No memory to map\n"); return -EINVAL; } ret = remap_vmalloc_range(vma, buf->vaddr, 0); if (ret) { pr_err("Remapping vmalloc memory, error: %d\n", ret); return ret; } /* * Make sure that vm_areas for 2 buffers won't be merged together */ vm_flags_set(vma, VM_DONTEXPAND); /* * Use common vm_area operations to track buffer refcount. 
*/ vma->vm_private_data = &buf->handler; vma->vm_ops = &vb2_common_vm_ops; vma->vm_ops->open(vma); return 0; } #ifdef CONFIG_HAS_DMA /*********************************************/ /* DMABUF ops for exporters */ /*********************************************/ struct vb2_vmalloc_attachment { struct sg_table sgt; enum dma_data_direction dma_dir; }; static int vb2_vmalloc_dmabuf_ops_attach(struct dma_buf *dbuf, struct dma_buf_attachment *dbuf_attach) { struct vb2_vmalloc_attachment *attach; struct vb2_vmalloc_buf *buf = dbuf->priv; int num_pages = PAGE_ALIGN(buf->size) / PAGE_SIZE; struct sg_table *sgt; struct scatterlist *sg; void *vaddr = buf->vaddr; int ret; int i; attach = kzalloc(sizeof(*attach), GFP_KERNEL); if (!attach) return -ENOMEM; sgt = &attach->sgt; ret = sg_alloc_table(sgt, num_pages, GFP_KERNEL); if (ret) { kfree(attach); return ret; } for_each_sgtable_sg(sgt, sg, i) { struct page *page = vmalloc_to_page(vaddr); if (!page) { sg_free_table(sgt); kfree(attach); return -ENOMEM; } sg_set_page(sg, page, PAGE_SIZE, 0); vaddr += PAGE_SIZE; } attach->dma_dir = DMA_NONE; dbuf_attach->priv = attach; return 0; } static void vb2_vmalloc_dmabuf_ops_detach(struct dma_buf *dbuf, struct dma_buf_attachment *db_attach) { struct vb2_vmalloc_attachment *attach = db_attach->priv; struct sg_table *sgt; if (!attach) return; sgt = &attach->sgt; /* release the scatterlist cache */ if (attach->dma_dir != DMA_NONE) dma_unmap_sgtable(db_attach->dev, sgt, attach->dma_dir, 0); sg_free_table(sgt); kfree(attach); db_attach->priv = NULL; } static struct sg_table *vb2_vmalloc_dmabuf_ops_map( struct dma_buf_attachment *db_attach, enum dma_data_direction dma_dir) { struct vb2_vmalloc_attachment *attach = db_attach->priv; struct sg_table *sgt; sgt = &attach->sgt; /* return previously mapped sg table */ if (attach->dma_dir == dma_dir) return sgt; /* release any previous cache */ if (attach->dma_dir != DMA_NONE) { dma_unmap_sgtable(db_attach->dev, sgt, attach->dma_dir, 0); attach->dma_dir = 
DMA_NONE; } /* mapping to the client with new direction */ if (dma_map_sgtable(db_attach->dev, sgt, dma_dir, 0)) { pr_err("failed to map scatterlist\n"); return ERR_PTR(-EIO); } attach->dma_dir = dma_dir; return sgt; } static void vb2_vmalloc_dmabuf_ops_unmap(struct dma_buf_attachment *db_attach, struct sg_table *sgt, enum dma_data_direction dma_dir) { /* nothing to be done here */ } static void vb2_vmalloc_dmabuf_ops_release(struct dma_buf *dbuf) { /* drop reference obtained in vb2_vmalloc_get_dmabuf */ vb2_vmalloc_put(dbuf->priv); } static int vb2_vmalloc_dmabuf_ops_vmap(struct dma_buf *dbuf, struct iosys_map *map) { struct vb2_vmalloc_buf *buf = dbuf->priv; iosys_map_set_vaddr(map, buf->vaddr); return 0; } static int vb2_vmalloc_dmabuf_ops_mmap(struct dma_buf *dbuf, struct vm_area_struct *vma) { return vb2_vmalloc_mmap(dbuf->priv, vma); } static const struct dma_buf_ops vb2_vmalloc_dmabuf_ops = { .attach = vb2_vmalloc_dmabuf_ops_attach, .detach = vb2_vmalloc_dmabuf_ops_detach, .map_dma_buf = vb2_vmalloc_dmabuf_ops_map, .unmap_dma_buf = vb2_vmalloc_dmabuf_ops_unmap, .vmap = vb2_vmalloc_dmabuf_ops_vmap, .mmap = vb2_vmalloc_dmabuf_ops_mmap, .release = vb2_vmalloc_dmabuf_ops_release, }; static struct dma_buf *vb2_vmalloc_get_dmabuf(struct vb2_buffer *vb, void *buf_priv, unsigned long flags) { struct vb2_vmalloc_buf *buf = buf_priv; struct dma_buf *dbuf; DEFINE_DMA_BUF_EXPORT_INFO(exp_info); exp_info.ops = &vb2_vmalloc_dmabuf_ops; exp_info.size = buf->size; exp_info.flags = flags; exp_info.priv = buf; if (WARN_ON(!buf->vaddr)) return NULL; dbuf = dma_buf_export(&exp_info); if (IS_ERR(dbuf)) return NULL; /* dmabuf keeps reference to vb2 buffer */ refcount_inc(&buf->refcount); return dbuf; } #endif /* CONFIG_HAS_DMA */ /*********************************************/ /* callbacks for DMABUF buffers */ /*********************************************/ static int vb2_vmalloc_map_dmabuf(void *mem_priv) { struct vb2_vmalloc_buf *buf = mem_priv; struct iosys_map map; int ret; 
ret = dma_buf_vmap_unlocked(buf->dbuf, &map); if (ret) return -EFAULT; buf->vaddr = map.vaddr; return 0; } static void vb2_vmalloc_unmap_dmabuf(void *mem_priv) { struct vb2_vmalloc_buf *buf = mem_priv; struct iosys_map map = IOSYS_MAP_INIT_VADDR(buf->vaddr); dma_buf_vunmap_unlocked(buf->dbuf, &map); buf->vaddr = NULL; } static void vb2_vmalloc_detach_dmabuf(void *mem_priv) { struct vb2_vmalloc_buf *buf = mem_priv; struct iosys_map map = IOSYS_MAP_INIT_VADDR(buf->vaddr); if (buf->vaddr) dma_buf_vunmap_unlocked(buf->dbuf, &map); kfree(buf); } static void *vb2_vmalloc_attach_dmabuf(struct vb2_buffer *vb, struct device *dev, struct dma_buf *dbuf, unsigned long size) { struct vb2_vmalloc_buf *buf; if (dbuf->size < size) return ERR_PTR(-EFAULT); buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) return ERR_PTR(-ENOMEM); buf->dbuf = dbuf; buf->dma_dir = vb->vb2_queue->dma_dir; buf->size = size; return buf; } const struct vb2_mem_ops vb2_vmalloc_memops = { .alloc = vb2_vmalloc_alloc, .put = vb2_vmalloc_put, .get_userptr = vb2_vmalloc_get_userptr, .put_userptr = vb2_vmalloc_put_userptr, #ifdef CONFIG_HAS_DMA .get_dmabuf = vb2_vmalloc_get_dmabuf, #endif .map_dmabuf = vb2_vmalloc_map_dmabuf, .unmap_dmabuf = vb2_vmalloc_unmap_dmabuf, .attach_dmabuf = vb2_vmalloc_attach_dmabuf, .detach_dmabuf = vb2_vmalloc_detach_dmabuf, .vaddr = vb2_vmalloc_vaddr, .mmap = vb2_vmalloc_mmap, .num_users = vb2_vmalloc_num_users, }; EXPORT_SYMBOL_GPL(vb2_vmalloc_memops); MODULE_DESCRIPTION("vmalloc memory handling routines for videobuf2"); MODULE_AUTHOR("Pawel Osciak <pawel@osciak.com>"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS(DMA_BUF); |
| 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 
1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 
1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 
1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 | // SPDX-License-Identifier: GPL-2.0+ /* * inode.c -- user mode filesystem api for usb gadget controllers * * Copyright (C) 2003-2004 David Brownell * Copyright (C) 2003 Agilent Technologies */ /* #define VERBOSE_DEBUG */ #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/fs_context.h> #include 
<linux/pagemap.h> #include <linux/uts.h> #include <linux/wait.h> #include <linux/compiler.h> #include <linux/uaccess.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/kthread.h> #include <linux/aio.h> #include <linux/uio.h> #include <linux/refcount.h> #include <linux/delay.h> #include <linux/device.h> #include <linux/moduleparam.h> #include <linux/usb/gadgetfs.h> #include <linux/usb/gadget.h> #include <linux/usb/composite.h> /* for USB_GADGET_DELAYED_STATUS */ /* Undef helpers from linux/usb/composite.h as gadgetfs redefines them */ #undef DBG #undef ERROR #undef INFO /* * The gadgetfs API maps each endpoint to a file descriptor so that you * can use standard synchronous read/write calls for I/O. There's some * O_NONBLOCK and O_ASYNC/FASYNC style i/o support. Example usermode * drivers show how this works in practice. You can also use AIO to * eliminate I/O gaps between requests, to help when streaming data. * * Key parts that must be USB-specific are protocols defining how the * read/write operations relate to the hardware state machines. There * are two types of files. One type is for the device, implementing ep0. * The other type is for each IN or OUT endpoint. In both cases, the * user mode driver must configure the hardware before using it. * * - First, dev_config() is called when /dev/gadget/$CHIP is configured * (by writing configuration and device descriptors). Afterwards it * may serve as a source of device events, used to handle all control * requests other than basic enumeration. * * - Then, after a SET_CONFIGURATION control request, ep_config() is * called when each /dev/gadget/ep* file is configured (by writing * endpoint descriptors). Afterwards these files are used to write() * IN data or to read() OUT data. To halt the endpoint, a "wrong * direction" request is issued (like reading an IN endpoint). * * Unlike "usbfs" the only ioctl()s are for things that are rare, and maybe * not possible on all hardware. 
For example, precise fault handling with
 * respect to data left in endpoint fifos after aborted operations; or
 * selective clearing of endpoint halts, to implement SET_INTERFACE.
 */

#define	DRIVER_DESC	"USB Gadget filesystem"
#define	DRIVER_VERSION	"24 Aug 2004"

/* driver description, and the short name used as the log prefix by xprintk() */
static const char driver_desc [] = DRIVER_DESC;
static const char shortname [] = "gadgetfs";

MODULE_DESCRIPTION (DRIVER_DESC);
MODULE_AUTHOR ("David Brownell");
MODULE_LICENSE ("GPL");

/* forward declaration: ep_io_operations refers to ep_open() before it is defined */
static int ep_open(struct inode *, struct file *);


/*----------------------------------------------------------------------*/

/* NOTE(review): presumably the filesystem magic number; the user is outside this chunk */
#define GADGETFS_MAGIC		0xaee71ee7

/* /dev/gadget/$CHIP represents ep0 and the whole device */
enum ep0_state {
	/* DISABLED is the initial state. */
	STATE_DEV_DISABLED = 0,

	/* Only one open() of /dev/gadget/$CHIP; only one file tracks
	 * ep0/device i/o modes and binding to the controller.  Driver
	 * must always write descriptors to initialize the device, then
	 * the device becomes UNCONNECTED until enumeration.
	 */
	STATE_DEV_OPENED,

	/* From then on, ep0 fd is in either of two basic modes:
	 * - (UN)CONNECTED: read usb_gadgetfs_event(s) from it
	 * - SETUP: read/write will transfer control data and succeed;
	 *   or if "wrong direction", performs protocol stall
	 */
	STATE_DEV_UNCONNECTED,
	STATE_DEV_CONNECTED,
	STATE_DEV_SETUP,

	/* UNBOUND means the driver closed ep0, so the device won't be
	 * accessible again (DEV_DISABLED) until all fds are closed.
	 */
	STATE_DEV_UNBOUND,
};

/* enough for the whole queue: most events invalidate others */
#define	N_EVENT			5

/* size in bytes of the rbuf[] scratch buffer embedded in struct dev_data */
#define RBUF_SIZE		256

/*
 * Per-device state behind the ep0 file.  Instances are created by
 * dev_new() and reference counted through get_dev()/put_dev().
 */
struct dev_data {
	spinlock_t			lock;
	/* reference count; the final put_dev() frees the structure */
	refcount_t			count;
	/* raised around calls made with 'lock' temporarily dropped (see ep0_read()) */
	int				udc_usage;
	enum ep0_state			state;		/* P: lock */
	/* pending events; ev_next appears to be how many are queued -- TODO confirm */
	struct usb_gadgetfs_event	event [N_EVENT];
	unsigned			ev_next;
	/* SIGIO subscribers, signalled from ep0_readable() */
	struct fasync_struct		*fasync;
	u8				current_config;

	/* drivers reading ep0 MUST handle control requests (SETUP)
	 * reported that way; else the host will time out.
*/ unsigned usermode_setup : 1, setup_in : 1, setup_can_stall : 1, setup_out_ready : 1, setup_out_error : 1, setup_abort : 1, gadget_registered : 1; unsigned setup_wLength; /* the rest is basically write-once */ struct usb_config_descriptor *config, *hs_config; struct usb_device_descriptor *dev; struct usb_request *req; struct usb_gadget *gadget; struct list_head epfiles; void *buf; wait_queue_head_t wait; struct super_block *sb; struct dentry *dentry; /* except this scratch i/o buffer for ep0 */ u8 rbuf[RBUF_SIZE]; }; static inline void get_dev (struct dev_data *data) { refcount_inc (&data->count); } static void put_dev (struct dev_data *data) { if (likely (!refcount_dec_and_test (&data->count))) return; /* needs no more cleanup */ BUG_ON (waitqueue_active (&data->wait)); kfree (data); } static struct dev_data *dev_new (void) { struct dev_data *dev; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return NULL; dev->state = STATE_DEV_DISABLED; refcount_set (&dev->count, 1); spin_lock_init (&dev->lock); INIT_LIST_HEAD (&dev->epfiles); init_waitqueue_head (&dev->wait); return dev; } /*----------------------------------------------------------------------*/ /* other /dev/gadget/$ENDPOINT files represent endpoints */ enum ep_state { STATE_EP_DISABLED = 0, STATE_EP_READY, STATE_EP_ENABLED, STATE_EP_UNBOUND, }; struct ep_data { struct mutex lock; enum ep_state state; refcount_t count; struct dev_data *dev; /* must hold dev->lock before accessing ep or req */ struct usb_ep *ep; struct usb_request *req; ssize_t status; char name [16]; struct usb_endpoint_descriptor desc, hs_desc; struct list_head epfiles; wait_queue_head_t wait; struct dentry *dentry; }; static inline void get_ep (struct ep_data *data) { refcount_inc (&data->count); } static void put_ep (struct ep_data *data) { if (likely (!refcount_dec_and_test (&data->count))) return; put_dev (data->dev); /* needs no more cleanup */ BUG_ON (!list_empty (&data->epfiles)); BUG_ON (waitqueue_active (&data->wait)); kfree 
(data); } /*----------------------------------------------------------------------*/ /* most "how to use the hardware" policy choices are in userspace: * mapping endpoint roles (which the driver needs) to the capabilities * which the usb controller has. most of those capabilities are exposed * implicitly, starting with the driver name and then endpoint names. */ static const char *CHIP; static DEFINE_MUTEX(sb_mutex); /* Serialize superblock operations */ /*----------------------------------------------------------------------*/ /* NOTE: don't use dev_printk calls before binding to the gadget * at the end of ep0 configuration, or after unbind. */ /* too wordy: dev_printk(level , &(d)->gadget->dev , fmt , ## args) */ #define xprintk(d,level,fmt,args...) \ printk(level "%s: " fmt , shortname , ## args) #ifdef DEBUG #define DBG(dev,fmt,args...) \ xprintk(dev , KERN_DEBUG , fmt , ## args) #else #define DBG(dev,fmt,args...) \ do { } while (0) #endif /* DEBUG */ #ifdef VERBOSE_DEBUG #define VDEBUG DBG #else #define VDEBUG(dev,fmt,args...) \ do { } while (0) #endif /* DEBUG */ #define ERROR(dev,fmt,args...) \ xprintk(dev , KERN_ERR , fmt , ## args) #define INFO(dev,fmt,args...) \ xprintk(dev , KERN_INFO , fmt , ## args) /*----------------------------------------------------------------------*/ /* SYNCHRONOUS ENDPOINT OPERATIONS (bulk/intr/iso) * * After opening, configure non-control endpoints. Then use normal * stream read() and write() requests; and maybe ioctl() to get more * precise FIFO status when recovering from cancellation. */ static void epio_complete (struct usb_ep *ep, struct usb_request *req) { struct ep_data *epdata = ep->driver_data; if (!req->context) return; if (req->status) epdata->status = req->status; else epdata->status = req->actual; complete ((struct completion *)req->context); } /* tasklock endpoint, returning when it's connected. * still need dev->lock to use epdata->ep. 
*/ static int get_ready_ep (unsigned f_flags, struct ep_data *epdata, bool is_write) { int val; if (f_flags & O_NONBLOCK) { if (!mutex_trylock(&epdata->lock)) goto nonblock; if (epdata->state != STATE_EP_ENABLED && (!is_write || epdata->state != STATE_EP_READY)) { mutex_unlock(&epdata->lock); nonblock: val = -EAGAIN; } else val = 0; return val; } val = mutex_lock_interruptible(&epdata->lock); if (val < 0) return val; switch (epdata->state) { case STATE_EP_ENABLED: return 0; case STATE_EP_READY: /* not configured yet */ if (is_write) return 0; fallthrough; case STATE_EP_UNBOUND: /* clean disconnect */ break; // case STATE_EP_DISABLED: /* "can't happen" */ default: /* error! */ pr_debug ("%s: ep %p not available, state %d\n", shortname, epdata, epdata->state); } mutex_unlock(&epdata->lock); return -ENODEV; } static ssize_t ep_io (struct ep_data *epdata, void *buf, unsigned len) { DECLARE_COMPLETION_ONSTACK (done); int value; spin_lock_irq (&epdata->dev->lock); if (likely (epdata->ep != NULL)) { struct usb_request *req = epdata->req; req->context = &done; req->complete = epio_complete; req->buf = buf; req->length = len; value = usb_ep_queue (epdata->ep, req, GFP_ATOMIC); } else value = -ENODEV; spin_unlock_irq (&epdata->dev->lock); if (likely (value == 0)) { value = wait_for_completion_interruptible(&done); if (value != 0) { spin_lock_irq (&epdata->dev->lock); if (likely (epdata->ep != NULL)) { DBG (epdata->dev, "%s i/o interrupted\n", epdata->name); usb_ep_dequeue (epdata->ep, epdata->req); spin_unlock_irq (&epdata->dev->lock); wait_for_completion(&done); if (epdata->status == -ECONNRESET) epdata->status = -EINTR; } else { spin_unlock_irq (&epdata->dev->lock); DBG (epdata->dev, "endpoint gone\n"); wait_for_completion(&done); epdata->status = -ENODEV; } } return epdata->status; } return value; } static int ep_release (struct inode *inode, struct file *fd) { struct ep_data *data = fd->private_data; int value; value = mutex_lock_interruptible(&data->lock); if (value < 
0) return value; /* clean up if this can be reopened */ if (data->state != STATE_EP_UNBOUND) { data->state = STATE_EP_DISABLED; data->desc.bDescriptorType = 0; data->hs_desc.bDescriptorType = 0; usb_ep_disable(data->ep); } mutex_unlock(&data->lock); put_ep (data); return 0; } static long ep_ioctl(struct file *fd, unsigned code, unsigned long value) { struct ep_data *data = fd->private_data; int status; if ((status = get_ready_ep (fd->f_flags, data, false)) < 0) return status; spin_lock_irq (&data->dev->lock); if (likely (data->ep != NULL)) { switch (code) { case GADGETFS_FIFO_STATUS: status = usb_ep_fifo_status (data->ep); break; case GADGETFS_FIFO_FLUSH: usb_ep_fifo_flush (data->ep); break; case GADGETFS_CLEAR_HALT: status = usb_ep_clear_halt (data->ep); break; default: status = -ENOTTY; } } else status = -ENODEV; spin_unlock_irq (&data->dev->lock); mutex_unlock(&data->lock); return status; } /*----------------------------------------------------------------------*/ /* ASYNCHRONOUS ENDPOINT I/O OPERATIONS (bulk/intr/iso) */ struct kiocb_priv { struct usb_request *req; struct ep_data *epdata; struct kiocb *iocb; struct mm_struct *mm; struct work_struct work; void *buf; struct iov_iter to; const void *to_free; unsigned actual; }; static int ep_aio_cancel(struct kiocb *iocb) { struct kiocb_priv *priv = iocb->private; struct ep_data *epdata; int value; local_irq_disable(); epdata = priv->epdata; // spin_lock(&epdata->dev->lock); if (likely(epdata && epdata->ep && priv->req)) value = usb_ep_dequeue (epdata->ep, priv->req); else value = -EINVAL; // spin_unlock(&epdata->dev->lock); local_irq_enable(); return value; } static void ep_user_copy_worker(struct work_struct *work) { struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work); struct mm_struct *mm = priv->mm; struct kiocb *iocb = priv->iocb; size_t ret; kthread_use_mm(mm); ret = copy_to_iter(priv->buf, priv->actual, &priv->to); kthread_unuse_mm(mm); if (!ret) ret = -EFAULT; /* completing the iocb can 
drop the ctx and mm, don't touch mm after */
	iocb->ki_complete(iocb, ret);

	kfree(priv->buf);
	kfree(priv->to_free);
	kfree(priv);
}

/*
 * Completion callback for requests submitted by ep_aio().
 *
 * Runs under dev->lock.  Detaches the request and endpoint from the
 * kiocb_priv, then either completes the iocb immediately (freeing the
 * data buffer and priv) or hands the buffer to ep_user_copy_worker()
 * via a work item.  Finally frees the usb_request and drops the
 * endpoint reference taken in ep_aio().
 */
static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
{
	struct kiocb		*iocb = req->context;
	struct kiocb_priv	*priv = iocb->private;
	struct ep_data		*epdata = priv->epdata;

	/* lock against disconnect (and ideally, cancel) */
	spin_lock(&epdata->dev->lock);
	priv->req = NULL;
	priv->epdata = NULL;

	/* if this was a write or a read returning no data then we
	 * don't need to copy anything to userspace, so we can
	 * complete the aio request immediately.
	 */
	/* NOTE(review): ep_read_iter() also leaves to_free NULL when the
	 * iterator is ITER_UBUF, so such a read appears to take this branch
	 * and complete without copying data -- confirm this is intended.
	 */
	if (priv->to_free == NULL || unlikely(req->actual == 0)) {
		kfree(req->buf);
		kfree(priv->to_free);
		kfree(priv);
		iocb->private = NULL;
		/* report the byte count, or the request status when nothing moved */
		iocb->ki_complete(iocb,
				req->actual ? req->actual : (long)req->status);
	} else {
		/* ep_copy_to_user() won't report both; we hide some faults */
		if (unlikely(0 != req->status))
			DBG(epdata->dev, "%s fault %d len %d\n",
				ep->name, req->status, req->actual);

		/* ownership of the data buffer moves to the worker, which frees it */
		priv->buf = req->buf;
		priv->actual = req->actual;
		INIT_WORK(&priv->work, ep_user_copy_worker);
		schedule_work(&priv->work);
	}

	usb_ep_free_request(ep, req);
	spin_unlock(&epdata->dev->lock);
	/* epdata was cached above because priv may already have been freed */
	put_ep(epdata);
}

/*
 * Start an asynchronous transfer of @len bytes at @buf for @iocb.
 * The visible part below wires @priv to the iocb, registers
 * ep_aio_cancel() as the cancel function and takes an endpoint
 * reference.
 */
static ssize_t ep_aio(struct kiocb *iocb,
		      struct kiocb_priv *priv,
		      struct ep_data *epdata,
		      char *buf,
		      size_t len)
{
	struct usb_request *req;
	ssize_t value;

	iocb->private = priv;
	priv->iocb = iocb;

	kiocb_set_cancel_fn(iocb, ep_aio_cancel);
	get_ep(epdata);
	priv->epdata = epdata;
	priv->actual = 0;
	priv->mm = current->mm; /* mm teardown waits for iocbs in exit_aio() */

	/* each kiocb is coupled to one usb_request, but we can't
	 * allocate or submit those if the host disconnected.
*/
	spin_lock_irq(&epdata->dev->lock);
	value = -ENODEV;
	if (unlikely(epdata->ep == NULL))
		goto fail;

	/* GFP_ATOMIC: allocated and queued while holding the spinlock */
	req = usb_ep_alloc_request(epdata->ep, GFP_ATOMIC);
	value = -ENOMEM;
	if (unlikely(!req))
		goto fail;

	priv->req = req;
	req->buf = buf;
	req->length = len;
	req->complete = ep_aio_complete;
	req->context = iocb;
	value = usb_ep_queue(epdata->ep, req, GFP_ATOMIC);
	if (unlikely(0 != value)) {
		usb_ep_free_request(epdata->ep, req);
		goto fail;
	}
	spin_unlock_irq(&epdata->dev->lock);
	/* queued: from here ep_aio_complete() owns priv and the buffer */
	return -EIOCBQUEUED;

fail:
	/* undo everything done so far; the caller still owns (and frees) buf */
	spin_unlock_irq(&epdata->dev->lock);
	kfree(priv->to_free);
	kfree(priv);
	put_ep(epdata);
	return value;
}

/*
 * read_iter handler for endpoint files.
 *
 * Reading an IN endpoint is the "wrong direction": a synchronous,
 * non-isochronous read halts the endpoint and returns -EBADMSG, while
 * an isochronous or asynchronous one returns -EINVAL.  Otherwise a
 * bounce buffer is allocated and the transfer is done either
 * synchronously through ep_io() or asynchronously through ep_aio().
 * epdata->lock, taken by get_ready_ep(), is released on every path.
 */
static ssize_t
ep_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct ep_data *epdata = file->private_data;
	size_t len = iov_iter_count(to);
	ssize_t value;
	char *buf;

	if ((value = get_ready_ep(file->f_flags, epdata, false)) < 0)
		return value;

	/* halt any endpoint by doing a "wrong direction" i/o call */
	if (usb_endpoint_dir_in(&epdata->desc)) {
		if (usb_endpoint_xfer_isoc(&epdata->desc) ||
		    !is_sync_kiocb(iocb)) {
			mutex_unlock(&epdata->lock);
			return -EINVAL;
		}
		DBG (epdata->dev, "%s halt\n", epdata->name);
		spin_lock_irq(&epdata->dev->lock);
		if (likely(epdata->ep != NULL))
			usb_ep_set_halt(epdata->ep);
		spin_unlock_irq(&epdata->dev->lock);
		mutex_unlock(&epdata->lock);
		return -EBADMSG;
	}

	buf = kmalloc(len, GFP_KERNEL);
	if (unlikely(!buf)) {
		mutex_unlock(&epdata->lock);
		return -ENOMEM;
	}
	if (is_sync_kiocb(iocb)) {
		value = ep_io(epdata, buf, len);
		/* a short copy to the caller's iterator is reported as -EFAULT */
		if (value >= 0 && (copy_to_iter(buf, value, to) != value))
			value = -EFAULT;
	} else {
		struct kiocb_priv *priv = kzalloc(sizeof *priv, GFP_KERNEL);
		value = -ENOMEM;
		if (!priv)
			goto fail;
		/* keep a private copy of the iterator for the deferred copy-out */
		priv->to_free = dup_iter(&priv->to, to, GFP_KERNEL);
		/* a NULL to_free is an error only for non-ubuf iterators */
		if (!iter_is_ubuf(&priv->to) && !priv->to_free) {
			kfree(priv);
			goto fail;
		}
		value = ep_aio(iocb, priv, epdata, buf, len);
		/* once queued the buffer belongs to the request; don't free it here */
		if (value == -EIOCBQUEUED)
			buf = NULL;
	}
fail:
	kfree(buf);
	mutex_unlock(&epdata->lock);
	return value;
}

static ssize_t ep_config(struct
ep_data *, const char *, size_t);

/*
 * write_iter handler for endpoint files.
 *
 * While the endpoint is not yet ENABLED the written bytes are passed to
 * ep_config().  Once configured, writing an OUT endpoint is the "wrong
 * direction": a synchronous, non-isochronous write halts the endpoint
 * and returns -EBADMSG, an isochronous or asynchronous one returns
 * -EINVAL.  Otherwise the data is copied into a kernel buffer and sent
 * through ep_io() (sync) or ep_aio() (async).  epdata->lock, taken by
 * get_ready_ep(), is released on every path.
 */
static ssize_t
ep_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct ep_data *epdata = file->private_data;
	size_t len = iov_iter_count(from);
	bool configured;
	ssize_t value;
	char *buf;

	if ((value = get_ready_ep(file->f_flags, epdata, true)) < 0)
		return value;

	/* sampled once, under epdata->lock */
	configured = epdata->state == STATE_EP_ENABLED;

	/* halt any endpoint by doing a "wrong direction" i/o call */
	if (configured && !usb_endpoint_dir_in(&epdata->desc)) {
		if (usb_endpoint_xfer_isoc(&epdata->desc) ||
		    !is_sync_kiocb(iocb)) {
			mutex_unlock(&epdata->lock);
			return -EINVAL;
		}
		DBG (epdata->dev, "%s halt\n", epdata->name);
		spin_lock_irq(&epdata->dev->lock);
		if (likely(epdata->ep != NULL))
			usb_ep_set_halt(epdata->ep);
		spin_unlock_irq(&epdata->dev->lock);
		mutex_unlock(&epdata->lock);
		return -EBADMSG;
	}

	buf = kmalloc(len, GFP_KERNEL);
	if (unlikely(!buf)) {
		mutex_unlock(&epdata->lock);
		return -ENOMEM;
	}

	if (unlikely(!copy_from_iter_full(buf, len, from))) {
		value = -EFAULT;
		goto out;
	}

	if (unlikely(!configured)) {
		/* first write after open: the payload is the endpoint descriptors */
		value = ep_config(epdata, buf, len);
	} else if (is_sync_kiocb(iocb)) {
		value = ep_io(epdata, buf, len);
	} else {
		struct kiocb_priv *priv = kzalloc(sizeof *priv, GFP_KERNEL);
		value = -ENOMEM;
		if (priv) {
			value = ep_aio(iocb, priv, epdata, buf, len);
			/* once queued the buffer belongs to the request */
			if (value == -EIOCBQUEUED)
				buf = NULL;
		}
	}
out:
	kfree(buf);
	mutex_unlock(&epdata->lock);
	return value;
}

/*----------------------------------------------------------------------*/

/* used after endpoint configuration */
static const struct file_operations ep_io_operations = {
	.owner =	THIS_MODULE,

	.open =		ep_open,
	.release =	ep_release,
	.llseek =	no_llseek,
	.unlocked_ioctl = ep_ioctl,
	.read_iter =	ep_read_iter,
	.write_iter =	ep_write_iter,
};

/* ENDPOINT INITIALIZATION
 *
 *     fd = open ("/dev/gadget/$ENDPOINT", O_RDWR)
 *     status = write (fd, descriptors, sizeof descriptors)
 *
 * That write establishes the endpoint configuration, configuring
 * the controller to process bulk,
interrupt, or isochronous transfers
 * at the right maxpacket size, and so on.
 *
 * The descriptors are message type 1, identified by a host order u32
 * at the beginning of what's written.  Descriptor order is: full/low
 * speed descriptor, then optional high speed descriptor.
 *
 * Returns the number of bytes consumed (the original @len) on success.
 * Errors: -EL2HLT when the endpoint is not in STATE_EP_READY, -EINVAL
 * for a malformed message or an unsupported gadget speed, -ENOENT when
 * the device is unbound, -ENODEV when the endpoint is gone, or the
 * error from usb_ep_enable().  On any failure both stored descriptor
 * types are zeroed.
 */
static ssize_t
ep_config (struct ep_data *data, const char *buf, size_t len)
{
	struct usb_ep		*ep;
	u32			tag;
	/* 'length' keeps the original size; 'len' is reduced while parsing */
	int			value, length = len;

	if (data->state != STATE_EP_READY) {
		value = -EL2HLT;
		goto fail;
	}

	/* need at least the 4-byte tag plus one endpoint descriptor */
	value = len;
	if (len < USB_DT_ENDPOINT_SIZE + 4)
		goto fail0;

	/* we might need to change message format someday */
	memcpy(&tag, buf, 4);
	if (tag != 1) {
		/* NOTE(review): tag is a u32 but is printed with %d */
		DBG(data->dev, "config %s, bad tag %d\n", data->name, tag);
		goto fail0;
	}
	buf += 4;
	len -= 4;

	/* NOTE:  audio endpoint extensions not accepted here;
	 * just don't include the extra bytes.
	 */

	/* full/low speed descriptor, then high speed */
	memcpy(&data->desc, buf, USB_DT_ENDPOINT_SIZE);
	if (data->desc.bLength != USB_DT_ENDPOINT_SIZE
			|| data->desc.bDescriptorType != USB_DT_ENDPOINT)
		goto fail0;
	/* anything beyond one descriptor must be exactly one more (high speed) */
	if (len != USB_DT_ENDPOINT_SIZE) {
		if (len != 2 * USB_DT_ENDPOINT_SIZE)
			goto fail0;
		memcpy(&data->hs_desc, buf + USB_DT_ENDPOINT_SIZE,
			USB_DT_ENDPOINT_SIZE);
		if (data->hs_desc.bLength != USB_DT_ENDPOINT_SIZE
				|| data->hs_desc.bDescriptorType
					!= USB_DT_ENDPOINT) {
			DBG(data->dev, "config %s, bad hs length or type\n",
					data->name);
			goto fail0;
		}
	}

	spin_lock_irq (&data->dev->lock);
	if (data->dev->state == STATE_DEV_UNBOUND) {
		value = -ENOENT;
		goto gone;
	} else {
		ep = data->ep;
		if (ep == NULL) {
			value = -ENODEV;
			goto gone;
		}
	}
	/* pick the descriptor that matches the speed the gadget is running at */
	switch (data->dev->gadget->speed) {
	case USB_SPEED_LOW:
	case USB_SPEED_FULL:
		ep->desc = &data->desc;
		break;
	case USB_SPEED_HIGH:
		/* fails if caller didn't provide that descriptor...
*/
		ep->desc = &data->hs_desc;
		break;
	default:
		DBG(data->dev, "unconnected, %s init abandoned\n",
				data->name);
		value = -EINVAL;
		goto gone;
	}
	value = usb_ep_enable(ep);
	if (value == 0) {
		data->state = STATE_EP_ENABLED;
		/* success is reported as the full number of bytes written */
		value = length;
	}
gone:
	spin_unlock_irq (&data->dev->lock);
	if (value < 0) {
		/* 'fail' is also entered by goto from outside this block */
fail:
		data->desc.bDescriptorType = 0;
		data->hs_desc.bDescriptorType = 0;
	}
	return value;
fail0:
	/* malformed message: report -EINVAL, then share the cleanup above */
	value = -EINVAL;
	goto fail;
}

/*
 * open() handler for endpoint files.
 *
 * Succeeds only while the endpoint is STATE_EP_DISABLED: the state
 * becomes STATE_EP_READY, a reference is taken and the ep_data is
 * stored in fd->private_data.  Returns -EINTR when the lock wait is
 * interrupted, -ENOENT when the device is unbound, and -EBUSY when the
 * endpoint is in any other state.
 */
static int
ep_open (struct inode *inode, struct file *fd)
{
	struct ep_data		*data = inode->i_private;
	int			value = -EBUSY;

	if (mutex_lock_interruptible(&data->lock) != 0)
		return -EINTR;
	spin_lock_irq (&data->dev->lock);
	if (data->dev->state == STATE_DEV_UNBOUND)
		value = -ENOENT;
	else if (data->state == STATE_EP_DISABLED) {
		value = 0;
		data->state = STATE_EP_READY;
		get_ep (data);
		fd->private_data = data;
		VDEBUG (data->dev, "%s ready\n", data->name);
	} else
		DBG (data->dev, "%s state %d\n",
			data->name, data->state);
	spin_unlock_irq (&data->dev->lock);
	mutex_unlock(&data->lock);
	return value;
}

/*----------------------------------------------------------------------*/

/* EP0 IMPLEMENTATION can be partly in userspace.
 *
 * Drivers that use this facility receive various events, including
 * control requests the kernel doesn't handle.  Drivers that don't
 * use this facility may be too simple-minded for real applications.
*/

/* Wake sleepers on dev->wait and send SIGIO (POLL_IN) to fasync subscribers. */
static inline void ep0_readable (struct dev_data *dev)
{
	wake_up (&dev->wait);
	kill_fasync (&dev->fasync, SIGIO, POLL_IN);
}

/*
 * Return the ep0 request to its idle shape: free any buffer other than
 * the built-in rbuf and point the request back at rbuf, restore
 * epio_complete as the callback and clear setup_out_ready.
 */
static void clean_req (struct usb_ep *ep, struct usb_request *req)
{
	struct dev_data		*dev = ep->driver_data;

	if (req->buf != dev->rbuf) {
		kfree(req->buf);
		req->buf = dev->rbuf;
	}
	req->complete = epio_complete;
	dev->setup_out_ready = 0;
}

/*
 * Completion callback installed by setup_req() for ep0 transfers.
 *
 * When setup_in is clear, the outcome is recorded in setup_out_error,
 * setup_out_ready is set and readers are woken; on success the buffer
 * is kept (not cleaned) so it can still be read out.  In every other
 * case the request is cleaned up here.
 */
static void ep0_complete (struct usb_ep *ep, struct usb_request *req)
{
	struct dev_data		*dev = ep->driver_data;
	unsigned long		flags;
	int			free = 1;

	/* for control OUT, data must still get to userspace */
	spin_lock_irqsave(&dev->lock, flags);
	if (!dev->setup_in) {
		dev->setup_out_error = (req->status != 0);
		if (!dev->setup_out_error)
			free = 0;
		dev->setup_out_ready = 1;
		ep0_readable (dev);
	}

	/* clean up as appropriate */
	if (free && req->buf != &dev->rbuf)
		clean_req (ep, req);
	req->complete = epio_complete;
	spin_unlock_irqrestore(&dev->lock, flags);
}

/*
 * Prepare the ep0 request for a transfer of @len bytes.
 *
 * Returns -EBUSY while setup_out_ready is still set, and -ENOMEM (after
 * pointing the request back at rbuf) when req->buf is NULL.  A buffer
 * is allocated only when @len exceeds the built-in rbuf; otherwise
 * req->buf is used as it currently stands.
 */
static int setup_req (struct usb_ep *ep, struct usb_request *req, u16 len)
{
	struct dev_data	*dev = ep->driver_data;

	if (dev->setup_out_ready) {
		DBG (dev, "ep0 request busy!\n");
		return -EBUSY;
	}
	/* GFP_ATOMIC: this function itself never sleeps */
	if (len > sizeof (dev->rbuf))
		req->buf = kmalloc(len, GFP_ATOMIC);
	if (req->buf == NULL) {
		req->buf = dev->rbuf;
		return -ENOMEM;
	}
	req->complete = ep0_complete;
	req->length = len;
	req->zero = 0;
	return 0;
}

static ssize_t
ep0_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr)
{
	struct dev_data			*dev = fd->private_data;
	ssize_t				retval;
	enum ep0_state			state;

	spin_lock_irq (&dev->lock);
	if (dev->state <= STATE_DEV_OPENED) {
		retval = -EINVAL;
		goto done;
	}

	/* report fd mode change before acting on it */
	if (dev->setup_abort) {
		dev->setup_abort = 0;
		retval = -EIDRM;
		goto done;
	}

	/* control DATA stage */
	if ((state = dev->state) == STATE_DEV_SETUP) {

		if (dev->setup_in) {		/* stall IN */
			VDEBUG(dev, "ep0in stall\n");
			(void) usb_ep_set_halt (dev->gadget->ep0);
			retval = -EL2HLT;
			dev->state = STATE_DEV_CONNECTED;

		} else if (len == 0) {		/* ack SET_CONFIGURATION etc */
struct usb_ep *ep = dev->gadget->ep0; struct usb_request *req = dev->req; if ((retval = setup_req (ep, req, 0)) == 0) { ++dev->udc_usage; spin_unlock_irq (&dev->lock); retval = usb_ep_queue (ep, req, GFP_KERNEL); spin_lock_irq (&dev->lock); --dev->udc_usage; } dev->state = STATE_DEV_CONNECTED; /* assume that was SET_CONFIGURATION */ if (dev->current_config) { unsigned power; if (gadget_is_dualspeed(dev->gadget) && (dev->gadget->speed == USB_SPEED_HIGH)) power = dev->hs_config->bMaxPower; else power = dev->config->bMaxPower; usb_gadget_vbus_draw(dev->gadget, 2 * power); } } else { /* collect OUT data */ if ((fd->f_flags & O_NONBLOCK) != 0 && !dev->setup_out_ready) { retval = -EAGAIN; goto done; } spin_unlock_irq (&dev->lock); retval = wait_event_interruptible (dev->wait, dev->setup_out_ready != 0); /* FIXME state could change from under us */ spin_lock_irq (&dev->lock); if (retval) goto done; if (dev->state != STATE_DEV_SETUP) { retval = -ECANCELED; goto done; } dev->state = STATE_DEV_CONNECTED; if (dev->setup_out_error) retval = -EIO; else { len = min (len, (size_t)dev->req->actual); ++dev->udc_usage; spin_unlock_irq(&dev->lock); if (copy_to_user (buf, dev->req->buf, len)) retval = -EFAULT; else retval = len; spin_lock_irq(&dev->lock); --dev->udc_usage; clean_req (dev->gadget->ep0, dev->req); /* NOTE userspace can't yet choose to stall */ } } goto done; } /* else normal: return event data */ if (len < sizeof dev->event [0]) { retval = -EINVAL; goto done; } len -= len % sizeof (struct usb_gadgetfs_event); dev->usermode_setup = 1; scan: /* return queued events right away */ if (dev->ev_next != 0) { unsigned i, n; n = len / sizeof (struct usb_gadgetfs_event); if (dev->ev_next < n) n = dev->ev_next; /* ep0 i/o has special semantics during STATE_DEV_SETUP */ for (i = 0; i < n; i++) { if (dev->event [i].type == GADGETFS_SETUP) { dev->state = STATE_DEV_SETUP; n = i + 1; break; } } spin_unlock_irq (&dev->lock); len = n * sizeof (struct usb_gadgetfs_event); if (copy_to_user 
(buf, &dev->event, len)) retval = -EFAULT; else retval = len; if (len > 0) { /* NOTE this doesn't guard against broken drivers; * concurrent ep0 readers may lose events. */ spin_lock_irq (&dev->lock); if (dev->ev_next > n) { memmove(&dev->event[0], &dev->event[n], sizeof (struct usb_gadgetfs_event) * (dev->ev_next - n)); } dev->ev_next -= n; spin_unlock_irq (&dev->lock); } return retval; } if (fd->f_flags & O_NONBLOCK) { retval = -EAGAIN; goto done; } switch (state) { default: DBG (dev, "fail %s, state %d\n", __func__, state); retval = -ESRCH; break; case STATE_DEV_UNCONNECTED: case STATE_DEV_CONNECTED: spin_unlock_irq (&dev->lock); DBG (dev, "%s wait\n", __func__); /* wait for events */ retval = wait_event_interruptible (dev->wait, dev->ev_next != 0); if (retval < 0) return retval; spin_lock_irq (&dev->lock); goto scan; } done: spin_unlock_irq (&dev->lock); return retval; } static struct usb_gadgetfs_event * next_event (struct dev_data *dev, enum usb_gadgetfs_event_type type) { struct usb_gadgetfs_event *event; unsigned i; switch (type) { /* these events purge the queue */ case GADGETFS_DISCONNECT: if (dev->state == STATE_DEV_SETUP) dev->setup_abort = 1; fallthrough; case GADGETFS_CONNECT: dev->ev_next = 0; break; case GADGETFS_SETUP: /* previous request timed out */ case GADGETFS_SUSPEND: /* same effect */ /* these events can't be repeated */ for (i = 0; i != dev->ev_next; i++) { if (dev->event [i].type != type) continue; DBG(dev, "discard old event[%d] %d\n", i, type); dev->ev_next--; if (i == dev->ev_next) break; /* indices start at zero, for simplicity */ memmove (&dev->event [i], &dev->event [i + 1], sizeof (struct usb_gadgetfs_event) * (dev->ev_next - i)); } break; default: BUG (); } VDEBUG(dev, "event[%d] = %d\n", dev->ev_next, type); event = &dev->event [dev->ev_next++]; BUG_ON (dev->ev_next > N_EVENT); memset (event, 0, sizeof *event); event->type = type; return event; } static ssize_t ep0_write (struct file *fd, const char __user *buf, size_t len, loff_t 
*ptr) { struct dev_data *dev = fd->private_data; ssize_t retval = -ESRCH; /* report fd mode change before acting on it */ if (dev->setup_abort) { dev->setup_abort = 0; retval = -EIDRM; /* data and/or status stage for control request */ } else if (dev->state == STATE_DEV_SETUP) { len = min_t(size_t, len, dev->setup_wLength); if (dev->setup_in) { retval = setup_req (dev->gadget->ep0, dev->req, len); if (retval == 0) { dev->state = STATE_DEV_CONNECTED; ++dev->udc_usage; spin_unlock_irq (&dev->lock); if (copy_from_user (dev->req->buf, buf, len)) retval = -EFAULT; else { if (len < dev->setup_wLength) dev->req->zero = 1; retval = usb_ep_queue ( dev->gadget->ep0, dev->req, GFP_KERNEL); } spin_lock_irq(&dev->lock); --dev->udc_usage; if (retval < 0) { clean_req (dev->gadget->ep0, dev->req); } else retval = len; return retval; } /* can stall some OUT transfers */ } else if (dev->setup_can_stall) { VDEBUG(dev, "ep0out stall\n"); (void) usb_ep_set_halt (dev->gadget->ep0); retval = -EL2HLT; dev->state = STATE_DEV_CONNECTED; } else { DBG(dev, "bogus ep0out stall!\n"); } } else DBG (dev, "fail %s, state %d\n", __func__, dev->state); return retval; } static int ep0_fasync (int f, struct file *fd, int on) { struct dev_data *dev = fd->private_data; // caller must F_SETOWN before signal delivery happens VDEBUG (dev, "%s %s\n", __func__, on ? "on" : "off"); return fasync_helper (f, fd, on, &dev->fasync); } static struct usb_gadget_driver gadgetfs_driver; static int dev_release (struct inode *inode, struct file *fd) { struct dev_data *dev = fd->private_data; /* closing ep0 === shutdown all */ if (dev->gadget_registered) { usb_gadget_unregister_driver (&gadgetfs_driver); dev->gadget_registered = false; } /* at this point "good" hardware has disconnected the * device from USB; the host won't see it any more. * alternatively, all host requests will time out. 
*/ kfree (dev->buf); dev->buf = NULL; /* other endpoints were all decoupled from this device */ spin_lock_irq(&dev->lock); dev->state = STATE_DEV_DISABLED; spin_unlock_irq(&dev->lock); put_dev (dev); return 0; } static __poll_t ep0_poll (struct file *fd, poll_table *wait) { struct dev_data *dev = fd->private_data; __poll_t mask = 0; if (dev->state <= STATE_DEV_OPENED) return DEFAULT_POLLMASK; poll_wait(fd, &dev->wait, wait); spin_lock_irq(&dev->lock); /* report fd mode change before acting on it */ if (dev->setup_abort) { dev->setup_abort = 0; mask = EPOLLHUP; goto out; } if (dev->state == STATE_DEV_SETUP) { if (dev->setup_in || dev->setup_can_stall) mask = EPOLLOUT; } else { if (dev->ev_next != 0) mask = EPOLLIN; } out: spin_unlock_irq(&dev->lock); return mask; } static long gadget_dev_ioctl (struct file *fd, unsigned code, unsigned long value) { struct dev_data *dev = fd->private_data; struct usb_gadget *gadget = dev->gadget; long ret = -ENOTTY; spin_lock_irq(&dev->lock); if (dev->state == STATE_DEV_OPENED || dev->state == STATE_DEV_UNBOUND) { /* Not bound to a UDC */ } else if (gadget->ops->ioctl) { ++dev->udc_usage; spin_unlock_irq(&dev->lock); ret = gadget->ops->ioctl (gadget, code, value); spin_lock_irq(&dev->lock); --dev->udc_usage; } spin_unlock_irq(&dev->lock); return ret; } /*----------------------------------------------------------------------*/ /* The in-kernel gadget driver handles most ep0 issues, in particular * enumerating the single configuration (as provided from user space). * * Unrecognized ep0 requests may be handled in user space. 
*/ static void make_qualifier (struct dev_data *dev) { struct usb_qualifier_descriptor qual; struct usb_device_descriptor *desc; qual.bLength = sizeof qual; qual.bDescriptorType = USB_DT_DEVICE_QUALIFIER; qual.bcdUSB = cpu_to_le16 (0x0200); desc = dev->dev; qual.bDeviceClass = desc->bDeviceClass; qual.bDeviceSubClass = desc->bDeviceSubClass; qual.bDeviceProtocol = desc->bDeviceProtocol; /* assumes ep0 uses the same value for both speeds ... */ qual.bMaxPacketSize0 = dev->gadget->ep0->maxpacket; qual.bNumConfigurations = 1; qual.bRESERVED = 0; memcpy (dev->rbuf, &qual, sizeof qual); } static int config_buf (struct dev_data *dev, u8 type, unsigned index) { int len; int hs = 0; /* only one configuration */ if (index > 0) return -EINVAL; if (gadget_is_dualspeed(dev->gadget)) { hs = (dev->gadget->speed == USB_SPEED_HIGH); if (type == USB_DT_OTHER_SPEED_CONFIG) hs = !hs; } if (hs) { dev->req->buf = dev->hs_config; len = le16_to_cpu(dev->hs_config->wTotalLength); } else { dev->req->buf = dev->config; len = le16_to_cpu(dev->config->wTotalLength); } ((u8 *)dev->req->buf) [1] = type; return len; } static int gadgetfs_setup (struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl) { struct dev_data *dev = get_gadget_data (gadget); struct usb_request *req = dev->req; int value = -EOPNOTSUPP; struct usb_gadgetfs_event *event; u16 w_value = le16_to_cpu(ctrl->wValue); u16 w_length = le16_to_cpu(ctrl->wLength); if (w_length > RBUF_SIZE) { if (ctrl->bRequestType & USB_DIR_IN) { /* Cast away the const, we are going to overwrite on purpose. 
*/ __le16 *temp = (__le16 *)&ctrl->wLength; *temp = cpu_to_le16(RBUF_SIZE); w_length = RBUF_SIZE; } else { return value; } } spin_lock (&dev->lock); dev->setup_abort = 0; if (dev->state == STATE_DEV_UNCONNECTED) { if (gadget_is_dualspeed(gadget) && gadget->speed == USB_SPEED_HIGH && dev->hs_config == NULL) { spin_unlock(&dev->lock); ERROR (dev, "no high speed config??\n"); return -EINVAL; } dev->state = STATE_DEV_CONNECTED; INFO (dev, "connected\n"); event = next_event (dev, GADGETFS_CONNECT); event->u.speed = gadget->speed; ep0_readable (dev); /* host may have given up waiting for response. we can miss control * requests handled lower down (device/endpoint status and features); * then ep0_{read,write} will report the wrong status. controller * driver will have aborted pending i/o. */ } else if (dev->state == STATE_DEV_SETUP) dev->setup_abort = 1; req->buf = dev->rbuf; req->context = NULL; switch (ctrl->bRequest) { case USB_REQ_GET_DESCRIPTOR: if (ctrl->bRequestType != USB_DIR_IN) goto unrecognized; switch (w_value >> 8) { case USB_DT_DEVICE: value = min (w_length, (u16) sizeof *dev->dev); dev->dev->bMaxPacketSize0 = dev->gadget->ep0->maxpacket; req->buf = dev->dev; break; case USB_DT_DEVICE_QUALIFIER: if (!dev->hs_config) break; value = min (w_length, (u16) sizeof (struct usb_qualifier_descriptor)); make_qualifier (dev); break; case USB_DT_OTHER_SPEED_CONFIG: case USB_DT_CONFIG: value = config_buf (dev, w_value >> 8, w_value & 0xff); if (value >= 0) value = min (w_length, (u16) value); break; case USB_DT_STRING: goto unrecognized; default: // all others are errors break; } break; /* currently one config, two speeds */ case USB_REQ_SET_CONFIGURATION: if (ctrl->bRequestType != 0) goto unrecognized; if (0 == (u8) w_value) { value = 0; dev->current_config = 0; usb_gadget_vbus_draw(gadget, 8 /* mA */ ); // user mode expected to disable endpoints } else { u8 config, power; if (gadget_is_dualspeed(gadget) && gadget->speed == USB_SPEED_HIGH) { config = 
dev->hs_config->bConfigurationValue; power = dev->hs_config->bMaxPower; } else { config = dev->config->bConfigurationValue; power = dev->config->bMaxPower; } if (config == (u8) w_value) { value = 0; dev->current_config = config; usb_gadget_vbus_draw(gadget, 2 * power); } } /* report SET_CONFIGURATION like any other control request, * except that usermode may not stall this. the next * request mustn't be allowed start until this finishes: * endpoints and threads set up, etc. * * NOTE: older PXA hardware (before PXA 255: without UDCCFR) * has bad/racey automagic that prevents synchronizing here. * even kernel mode drivers often miss them. */ if (value == 0) { INFO (dev, "configuration #%d\n", dev->current_config); usb_gadget_set_state(gadget, USB_STATE_CONFIGURED); if (dev->usermode_setup) { dev->setup_can_stall = 0; goto delegate; } } break; #ifndef CONFIG_USB_PXA25X /* PXA automagically handles this request too */ case USB_REQ_GET_CONFIGURATION: if (ctrl->bRequestType != 0x80) goto unrecognized; *(u8 *)req->buf = dev->current_config; value = min (w_length, (u16) 1); break; #endif default: unrecognized: VDEBUG (dev, "%s req%02x.%02x v%04x i%04x l%d\n", dev->usermode_setup ? "delegate" : "fail", ctrl->bRequestType, ctrl->bRequest, w_value, le16_to_cpu(ctrl->wIndex), w_length); /* if there's an ep0 reader, don't stall */ if (dev->usermode_setup) { dev->setup_can_stall = 1; delegate: dev->setup_in = (ctrl->bRequestType & USB_DIR_IN) ? 
1 : 0; dev->setup_wLength = w_length; dev->setup_out_ready = 0; dev->setup_out_error = 0; /* read DATA stage for OUT right away */ if (unlikely (!dev->setup_in && w_length)) { value = setup_req (gadget->ep0, dev->req, w_length); if (value < 0) break; ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, dev->req, GFP_KERNEL); spin_lock (&dev->lock); --dev->udc_usage; if (value < 0) { clean_req (gadget->ep0, dev->req); break; } /* we can't currently stall these */ dev->setup_can_stall = 0; } /* state changes when reader collects event */ event = next_event (dev, GADGETFS_SETUP); event->u.setup = *ctrl; ep0_readable (dev); spin_unlock (&dev->lock); /* * Return USB_GADGET_DELAYED_STATUS as a workaround to * stop some UDC drivers (e.g. dwc3) from automatically * proceeding with the status stage for 0-length * transfers. * Should be removed once all UDC drivers are fixed to * always delay the status stage until a response is * queued to EP0. */ return w_length == 0 ? USB_GADGET_DELAYED_STATUS : 0; } } /* proceed with data transfer and status phases? 
*/ if (value >= 0 && dev->state != STATE_DEV_SETUP) { req->length = value; req->zero = value < w_length; ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, req, GFP_KERNEL); spin_lock(&dev->lock); --dev->udc_usage; spin_unlock(&dev->lock); if (value < 0) { DBG (dev, "ep_queue --> %d\n", value); req->status = 0; } return value; } /* device stalls when value < 0 */ spin_unlock (&dev->lock); return value; } static void destroy_ep_files (struct dev_data *dev) { DBG (dev, "%s %d\n", __func__, dev->state); /* dev->state must prevent interference */ spin_lock_irq (&dev->lock); while (!list_empty(&dev->epfiles)) { struct ep_data *ep; struct inode *parent; struct dentry *dentry; /* break link to FS */ ep = list_first_entry (&dev->epfiles, struct ep_data, epfiles); list_del_init (&ep->epfiles); spin_unlock_irq (&dev->lock); dentry = ep->dentry; ep->dentry = NULL; parent = d_inode(dentry->d_parent); /* break link to controller */ mutex_lock(&ep->lock); if (ep->state == STATE_EP_ENABLED) (void) usb_ep_disable (ep->ep); ep->state = STATE_EP_UNBOUND; usb_ep_free_request (ep->ep, ep->req); ep->ep = NULL; mutex_unlock(&ep->lock); wake_up (&ep->wait); put_ep (ep); /* break link to dcache */ inode_lock(parent); d_delete (dentry); dput (dentry); inode_unlock(parent); spin_lock_irq (&dev->lock); } spin_unlock_irq (&dev->lock); } static struct dentry * gadgetfs_create_file (struct super_block *sb, char const *name, void *data, const struct file_operations *fops); static int activate_ep_files (struct dev_data *dev) { struct usb_ep *ep; struct ep_data *data; gadget_for_each_ep (ep, dev->gadget) { data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto enomem0; data->state = STATE_EP_DISABLED; mutex_init(&data->lock); init_waitqueue_head (&data->wait); strncpy (data->name, ep->name, sizeof (data->name) - 1); refcount_set (&data->count, 1); data->dev = dev; get_dev (dev); data->ep = ep; ep->driver_data = data; data->req = usb_ep_alloc_request (ep, 
GFP_KERNEL); if (!data->req) goto enomem1; data->dentry = gadgetfs_create_file (dev->sb, data->name, data, &ep_io_operations); if (!data->dentry) goto enomem2; list_add_tail (&data->epfiles, &dev->epfiles); } return 0; enomem2: usb_ep_free_request (ep, data->req); enomem1: put_dev (dev); kfree (data); enomem0: DBG (dev, "%s enomem\n", __func__); destroy_ep_files (dev); return -ENOMEM; } static void gadgetfs_unbind (struct usb_gadget *gadget) { struct dev_data *dev = get_gadget_data (gadget); DBG (dev, "%s\n", __func__); spin_lock_irq (&dev->lock); dev->state = STATE_DEV_UNBOUND; while (dev->udc_usage > 0) { spin_unlock_irq(&dev->lock); usleep_range(1000, 2000); spin_lock_irq(&dev->lock); } spin_unlock_irq (&dev->lock); destroy_ep_files (dev); gadget->ep0->driver_data = NULL; set_gadget_data (gadget, NULL); /* we've already been disconnected ... no i/o is active */ if (dev->req) usb_ep_free_request (gadget->ep0, dev->req); DBG (dev, "%s done\n", __func__); put_dev (dev); } static struct dev_data *the_device; static int gadgetfs_bind(struct usb_gadget *gadget, struct usb_gadget_driver *driver) { struct dev_data *dev = the_device; if (!dev) return -ESRCH; if (0 != strcmp (CHIP, gadget->name)) { pr_err("%s expected %s controller not %s\n", shortname, CHIP, gadget->name); return -ENODEV; } set_gadget_data (gadget, dev); dev->gadget = gadget; gadget->ep0->driver_data = dev; /* preallocate control response and buffer */ dev->req = usb_ep_alloc_request (gadget->ep0, GFP_KERNEL); if (!dev->req) goto enomem; dev->req->context = NULL; dev->req->complete = epio_complete; if (activate_ep_files (dev) < 0) goto enomem; INFO (dev, "bound to %s driver\n", gadget->name); spin_lock_irq(&dev->lock); dev->state = STATE_DEV_UNCONNECTED; spin_unlock_irq(&dev->lock); get_dev (dev); return 0; enomem: gadgetfs_unbind (gadget); return -ENOMEM; } static void gadgetfs_disconnect (struct usb_gadget *gadget) { struct dev_data *dev = get_gadget_data (gadget); unsigned long flags; 
spin_lock_irqsave (&dev->lock, flags); if (dev->state == STATE_DEV_UNCONNECTED) goto exit; dev->state = STATE_DEV_UNCONNECTED; INFO (dev, "disconnected\n"); next_event (dev, GADGETFS_DISCONNECT); ep0_readable (dev); exit: spin_unlock_irqrestore (&dev->lock, flags); } static void gadgetfs_suspend (struct usb_gadget *gadget) { struct dev_data *dev = get_gadget_data (gadget); unsigned long flags; INFO (dev, "suspended from state %d\n", dev->state); spin_lock_irqsave(&dev->lock, flags); switch (dev->state) { case STATE_DEV_SETUP: // VERY odd... host died?? case STATE_DEV_CONNECTED: case STATE_DEV_UNCONNECTED: next_event (dev, GADGETFS_SUSPEND); ep0_readable (dev); fallthrough; default: break; } spin_unlock_irqrestore(&dev->lock, flags); } static struct usb_gadget_driver gadgetfs_driver = { .function = (char *) driver_desc, .bind = gadgetfs_bind, .unbind = gadgetfs_unbind, .setup = gadgetfs_setup, .reset = gadgetfs_disconnect, .disconnect = gadgetfs_disconnect, .suspend = gadgetfs_suspend, .driver = { .name = shortname, }, }; /*----------------------------------------------------------------------*/ /* DEVICE INITIALIZATION * * fd = open ("/dev/gadget/$CHIP", O_RDWR) * status = write (fd, descriptors, sizeof descriptors) * * That write establishes the device configuration, so the kernel can * bind to the controller ... guaranteeing it can handle enumeration * at all necessary speeds. Descriptor order is: * * . message tag (u32, host order) ... for now, must be zero; it * would change to support features like multi-config devices * . full/low speed config ... all wTotalLength bytes (with interface, * class, altsetting, endpoint, and other descriptors) * . high speed config ... all descriptors, for high speed operation; * this one's optional except for high-speed hardware * . device descriptor * * Endpoints are not yet enabled. Drivers must wait until device * configuration and interface altsetting changes create * the need to configure (or unconfigure) them. 
* * After initialization, the device stays active for as long as that * $CHIP file is open. Events must then be read from that descriptor, * such as configuration notifications. */ static int is_valid_config(struct usb_config_descriptor *config, unsigned int total) { return config->bDescriptorType == USB_DT_CONFIG && config->bLength == USB_DT_CONFIG_SIZE && total >= USB_DT_CONFIG_SIZE && config->bConfigurationValue != 0 && (config->bmAttributes & USB_CONFIG_ATT_ONE) != 0 && (config->bmAttributes & USB_CONFIG_ATT_WAKEUP) == 0; /* FIXME if gadget->is_otg, _must_ include an otg descriptor */ /* FIXME check lengths: walk to end */ } static ssize_t dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) { struct dev_data *dev = fd->private_data; ssize_t value, length = len; unsigned total; u32 tag; char *kbuf; spin_lock_irq(&dev->lock); if (dev->state > STATE_DEV_OPENED) { value = ep0_write(fd, buf, len, ptr); spin_unlock_irq(&dev->lock); return value; } spin_unlock_irq(&dev->lock); if ((len < (USB_DT_CONFIG_SIZE + USB_DT_DEVICE_SIZE + 4)) || (len > PAGE_SIZE * 4)) return -EINVAL; /* we might need to change message format someday */ if (copy_from_user (&tag, buf, 4)) return -EFAULT; if (tag != 0) return -EINVAL; buf += 4; length -= 4; kbuf = memdup_user(buf, length); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); spin_lock_irq (&dev->lock); value = -EINVAL; if (dev->buf) { spin_unlock_irq(&dev->lock); kfree(kbuf); return value; } dev->buf = kbuf; /* full or low speed config */ dev->config = (void *) kbuf; total = le16_to_cpu(dev->config->wTotalLength); if (!is_valid_config(dev->config, total) || total > length - USB_DT_DEVICE_SIZE) goto fail; kbuf += total; length -= total; /* optional high speed config */ if (kbuf [1] == USB_DT_CONFIG) { dev->hs_config = (void *) kbuf; total = le16_to_cpu(dev->hs_config->wTotalLength); if (!is_valid_config(dev->hs_config, total) || total > length - USB_DT_DEVICE_SIZE) goto fail; kbuf += total; length -= total; } else { 
dev->hs_config = NULL; } /* could support multiple configs, using another encoding! */ /* device descriptor (tweaked for paranoia) */ if (length != USB_DT_DEVICE_SIZE) goto fail; dev->dev = (void *)kbuf; if (dev->dev->bLength != USB_DT_DEVICE_SIZE || dev->dev->bDescriptorType != USB_DT_DEVICE || dev->dev->bNumConfigurations != 1) goto fail; dev->dev->bcdUSB = cpu_to_le16 (0x0200); /* triggers gadgetfs_bind(); then we can enumerate. */ spin_unlock_irq (&dev->lock); if (dev->hs_config) gadgetfs_driver.max_speed = USB_SPEED_HIGH; else gadgetfs_driver.max_speed = USB_SPEED_FULL; value = usb_gadget_register_driver(&gadgetfs_driver); if (value != 0) { spin_lock_irq(&dev->lock); goto fail; } else { /* at this point "good" hardware has for the first time * let the USB the host see us. alternatively, if users * unplug/replug that will clear all the error state. * * note: everything running before here was guaranteed * to choke driver model style diagnostics. from here * on, they can work ... except in cleanup paths that * kick in after the ep0 descriptor is closed. 
*/ value = len; dev->gadget_registered = true; } return value; fail: dev->config = NULL; dev->hs_config = NULL; dev->dev = NULL; spin_unlock_irq (&dev->lock); pr_debug ("%s: %s fail %zd, %p\n", shortname, __func__, value, dev); kfree (dev->buf); dev->buf = NULL; return value; } static int gadget_dev_open (struct inode *inode, struct file *fd) { struct dev_data *dev = inode->i_private; int value = -EBUSY; spin_lock_irq(&dev->lock); if (dev->state == STATE_DEV_DISABLED) { dev->ev_next = 0; dev->state = STATE_DEV_OPENED; fd->private_data = dev; get_dev (dev); value = 0; } spin_unlock_irq(&dev->lock); return value; } static const struct file_operations ep0_operations = { .llseek = no_llseek, .open = gadget_dev_open, .read = ep0_read, .write = dev_config, .fasync = ep0_fasync, .poll = ep0_poll, .unlocked_ioctl = gadget_dev_ioctl, .release = dev_release, }; /*----------------------------------------------------------------------*/ /* FILESYSTEM AND SUPERBLOCK OPERATIONS * * Mounting the filesystem creates a controller file, used first for * device configuration then later for event monitoring. */ /* FIXME PAM etc could set this security policy without mount options * if epfiles inherited ownership and permissons from ep0 ... */ static unsigned default_uid; static unsigned default_gid; static unsigned default_perm = S_IRUSR | S_IWUSR; module_param (default_uid, uint, 0644); module_param (default_gid, uint, 0644); module_param (default_perm, uint, 0644); static struct inode * gadgetfs_make_inode (struct super_block *sb, void *data, const struct file_operations *fops, int mode) { struct inode *inode = new_inode (sb); if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, default_uid); inode->i_gid = make_kgid(&init_user_ns, default_gid); simple_inode_init_ts(inode); inode->i_private = data; inode->i_fop = fops; } return inode; } /* creates in fs root directory, so non-renamable and non-linkable. 
* so inode and dentry are paired, until device reconfig. */ static struct dentry * gadgetfs_create_file (struct super_block *sb, char const *name, void *data, const struct file_operations *fops) { struct dentry *dentry; struct inode *inode; dentry = d_alloc_name(sb->s_root, name); if (!dentry) return NULL; inode = gadgetfs_make_inode (sb, data, fops, S_IFREG | (default_perm & S_IRWXUGO)); if (!inode) { dput(dentry); return NULL; } d_add (dentry, inode); return dentry; } static const struct super_operations gadget_fs_operations = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, }; static int gadgetfs_fill_super (struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct dev_data *dev; int rc; mutex_lock(&sb_mutex); if (the_device) { rc = -ESRCH; goto Done; } CHIP = usb_get_gadget_udc_name(); if (!CHIP) { rc = -ENODEV; goto Done; } /* superblock */ sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = GADGETFS_MAGIC; sb->s_op = &gadget_fs_operations; sb->s_time_gran = 1; /* root inode */ inode = gadgetfs_make_inode (sb, NULL, &simple_dir_operations, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) goto Enomem; inode->i_op = &simple_dir_inode_operations; if (!(sb->s_root = d_make_root (inode))) goto Enomem; /* the ep0 file is named after the controller we expect; * user mode code can use it for sanity checks, like we do. */ dev = dev_new (); if (!dev) goto Enomem; dev->sb = sb; dev->dentry = gadgetfs_create_file(sb, CHIP, dev, &ep0_operations); if (!dev->dentry) { put_dev(dev); goto Enomem; } /* other endpoint files are available after hardware setup, * from binding to a controller. 
*/ the_device = dev; rc = 0; goto Done; Enomem: kfree(CHIP); CHIP = NULL; rc = -ENOMEM; Done: mutex_unlock(&sb_mutex); return rc; } /* "mount -t gadgetfs path /dev/gadget" ends up here */ static int gadgetfs_get_tree(struct fs_context *fc) { return get_tree_single(fc, gadgetfs_fill_super); } static const struct fs_context_operations gadgetfs_context_ops = { .get_tree = gadgetfs_get_tree, }; static int gadgetfs_init_fs_context(struct fs_context *fc) { fc->ops = &gadgetfs_context_ops; return 0; } static void gadgetfs_kill_sb (struct super_block *sb) { mutex_lock(&sb_mutex); kill_litter_super (sb); if (the_device) { put_dev (the_device); the_device = NULL; } kfree(CHIP); CHIP = NULL; mutex_unlock(&sb_mutex); } /*----------------------------------------------------------------------*/ static struct file_system_type gadgetfs_type = { .owner = THIS_MODULE, .name = shortname, .init_fs_context = gadgetfs_init_fs_context, .kill_sb = gadgetfs_kill_sb, }; MODULE_ALIAS_FS("gadgetfs"); /*----------------------------------------------------------------------*/ static int __init gadgetfs_init (void) { int status; status = register_filesystem (&gadgetfs_type); if (status == 0) pr_info ("%s: %s, version " DRIVER_VERSION "\n", shortname, driver_desc); return status; } module_init (gadgetfs_init); static void __exit gadgetfs_cleanup (void) { pr_debug ("unregister %s\n", shortname); unregister_filesystem (&gadgetfs_type); } module_exit (gadgetfs_cleanup); |
| 6 2710 2709 1409 2587 2629 894 22 22 22 11 2623 1524 2622 101 458 1369 1528 2631 1527 2697 2700 2701 2621 2620 92 2474 2470 69 2433 944 590 1012 2194 1279 674 117 1062 1320 1011 948 1218 22 16 9 22 2558 1789 2475 2568 2622 2594 2615 1471 1703 2556 108 40 2627 47 58 69 101 2624 92 2620 87 2625 18 286 142 109 281 285 1 285 3 4 286 195 2623 2615 2568 195 2620 1217 687 2620 2624 2620 2628 2616 1218 1728 2528 282 284 283 285 287 287 287 2622 5 2626 2626 5 5 5 5 286 9 9 9 9 2624 2625 2620 2622 197 2572 2623 2629 2621 2630 2620 195 2575 2627 6 2630 2631 286 6 287 286 285 287 684 1334 3 732 733 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 
378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 
878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 
1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 
1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 
2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 
2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 
2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 
3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 | // SPDX-License-Identifier: GPL-2.0-only /* * mm/percpu.c - percpu memory allocator * * Copyright (C) 2009 SUSE Linux Products GmbH * Copyright (C) 2009 Tejun Heo <tj@kernel.org> * * Copyright (C) 2017 Facebook Inc. * Copyright (C) 2017 Dennis Zhou <dennis@kernel.org> * * The percpu allocator handles both static and dynamic areas. Percpu * areas are allocated in chunks which are divided into units. There is * a 1-to-1 mapping for units to possible cpus. These units are grouped * based on NUMA properties of the machine. * * c0 c1 c2 * ------------------- ------------------- ------------ * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u * ------------------- ...... ------------------- .... ------------ * * Allocation is done by offsets into a unit's address space. Ie., an * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0, * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear * and even sparse. Access is handled by configuring percpu base * registers according to the cpu to unit mappings and offsetting the * base address using pcpu_unit_size. * * There is special consideration for the first chunk which must handle * the static percpu variables in the kernel image as allocation services * are not online yet. 
In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * The allocator organizes chunks into lists according to free size and
 * memcg-awareness.  To make a percpu allocation memcg-aware the __GFP_ACCOUNT
 * flag should be passed.  All memcg-aware allocations share one set
 * of chunks and all unaccounted allocations and allocations performed
 * by processes belonging to the root memory cgroup use the second set.
 *
 * The allocator tries to allocate from the fullest chunk first. Each chunk
 * is managed by a bitmap with metadata blocks.  The allocation map is updated
 * on every allocation and free to reflect the current state while the boundary
 * map is only updated on allocation.  Each metadata block contains
 * information to help mitigate the need to iterate over large portions
 * of the bitmap.  The reverse mapping from page to chunk is stored in
 * the page's index.  Lastly, units are lazily backed and grow in unison.
 *
 * There is a unique conversion that goes on here between bytes and bits.
 * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
 * tracks the number of pages it is responsible for in nr_pages.  Helper
 * functions are used to convert between bytes, bits, and blocks.
 * All hints are managed in bits unless explicitly stated.
* * To use this allocator, arch code should do the following: * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be * different from the default * * - use pcpu_setup_first_chunk() during percpu area initialization to * setup the first chunk containing the kernel static percpu area */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bitmap.h> #include <linux/cpumask.h> #include <linux/memblock.h> #include <linux/err.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/pfn.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> #include <linux/kmemleak.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/memcontrol.h> #include <asm/cacheflush.h> #include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/io.h> #define CREATE_TRACE_POINTS #include <trace/events/percpu.h> #include "percpu-internal.h" /* * The slots are sorted by the size of the biggest continuous free area. * 1-31 bytes share the same slot. 
*/
/*
 * __pcpu_size_to_slot() subtracts this from fls(size) when it converts a
 * size in bytes into a slot index.
 */
#define PCPU_SLOT_BASE_SHIFT		5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD	3

/*
 * Bounds between which the balance work tries to keep the number of empty
 * populated pages.
 */
#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4

#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
/* rebase @addr from pcpu_base_addr onto __per_cpu_start */
#define __addr_to_pcpu_ptr(addr)					\
	(void __percpu *)((unsigned long)(addr) -			\
			  (unsigned long)pcpu_base_addr +		\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
/* inverse of the above: rebase @ptr from __per_cpu_start onto pcpu_base_addr */
#define __pcpu_ptr_to_addr(ptr)						\
	(void __force *)((unsigned long)(ptr) +				\
			 (unsigned long)pcpu_base_addr -		\
			 (unsigned long)__per_cpu_start)
#endif
#else /* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#endif /* CONFIG_SMP */

/*
 * Layout parameters.  All of them are __ro_after_init and none is written
 * in the helpers that follow; presumably they are filled in while the first
 * chunk is set up - confirm against the init code.
 */
/* multiplied by a cpu's unit number in pcpu_page_idx() */
static int pcpu_unit_pages __ro_after_init;
/* a size equal to this is mapped to pcpu_free_slot by pcpu_size_to_slot() */
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static int pcpu_free_slot __ro_after_init;
int pcpu_sidelined_slot __ro_after_init;
/* index of the list pcpu_isolate_chunk() moves chunks onto */
int pcpu_to_depopulate_slot __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;

static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.
Note that unlike other * chunks, this one can be allocated and mapped in several different * ways and thus often doesn't live in the vmalloc area. */ struct pcpu_chunk *pcpu_first_chunk __ro_after_init; /* * Optional reserved chunk. This chunk reserves part of the first * chunk and serves it for reserved allocations. When the reserved * region doesn't exist, the following variable is NULL. */ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */ /* * The number of empty populated pages, protected by pcpu_lock. * The reserved chunk doesn't contribute to the count. */ int pcpu_nr_empty_pop_pages; /* * The number of populated pages in use by the allocator, protected by * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets * allocated/deallocated, it is allocated/deallocated in all units of a chunk * and increments/decrements this count by 1). */ static unsigned long pcpu_nr_populated; /* * Balance work is used to populate or destroy chunks asynchronously. We * try to keep the number of populated free pages between * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one * empty chunk. */ static void pcpu_balance_workfn(struct work_struct *work); static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); static bool pcpu_async_enabled __read_mostly; static bool pcpu_atomic_alloc_failed; static void pcpu_schedule_balance_work(void) { if (pcpu_async_enabled) schedule_work(&pcpu_balance_work); } /** * pcpu_addr_in_chunk - check if the address is served from this chunk * @chunk: chunk of interest * @addr: percpu address * * RETURNS: * True if the address is served from this chunk. 
*/ static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr) { void *start_addr, *end_addr; if (!chunk) return false; start_addr = chunk->base_addr + chunk->start_offset; end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE - chunk->end_offset; return addr >= start_addr && addr < end_addr; } static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } static int pcpu_size_to_slot(int size) { if (size == pcpu_unit_size) return pcpu_free_slot; return __pcpu_size_to_slot(size); } static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { const struct pcpu_block_md *chunk_md = &chunk->chunk_md; if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk_md->contig_hint == 0) return 0; return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); } /* set the pointer to a chunk in a page struct */ static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) { page->index = (unsigned long)pcpu; } /* obtain pointer to a chunk from a page struct */ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) { return (struct pcpu_chunk *)page->index; } static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) { return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx) { return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT); } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { return (unsigned long)chunk->base_addr + pcpu_unit_page_offset(cpu, page_idx); } /* * The following are helper functions to help access bitmaps and convert * between bitmap offsets to address offsets. 
*/
/* Return the word of @chunk's alloc_map at which metadata block @index begins. */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
	return chunk->alloc_map +
	       (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
}

/* Index of the metadata block that chunk bit offset @off falls in. */
static unsigned long pcpu_off_to_block_index(int off)
{
	return off / PCPU_BITMAP_BLOCK_BITS;
}

/*
 * Offset of chunk bit offset @off inside its metadata block.  The mask is
 * only a modulo if PCPU_BITMAP_BLOCK_BITS is a power of two - presumably
 * guaranteed where it is defined, confirm there.
 */
static unsigned long pcpu_off_to_block_off(int off)
{
	return off & (PCPU_BITMAP_BLOCK_BITS - 1);
}

/* Combine block @index and in-block offset @off into a chunk bit offset. */
static unsigned long pcpu_block_off_to_off(int index, int off)
{
	return index * PCPU_BITMAP_BLOCK_BITS + off;
}

/**
 * pcpu_check_block_hint - check against the contig hint
 * @block: block of interest
 * @bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Check to see if the allocation can fit in the block's contig hint.
 * Note, a chunk uses the same hints as a block so this can also check against
 * the chunk's contig hint.
 *
 * RETURNS:
 * True if @bits plus the padding needed to align the start of the contig
 * hint do not exceed the contig hint.
 */
static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
				  size_t align)
{
	/* padding needed to bring contig_hint_start up to @align */
	int bit_off = ALIGN(block->contig_hint_start, align) -
		block->contig_hint_start;

	return bit_off + bits <= block->contig_hint;
}

/**
 * pcpu_next_hint - determine which hint to use
 * @block: block of interest
 * @alloc_bits: size of allocation
 *
 * This determines if we should scan based on the scan_hint or first_free.
 * In general, we want to scan from first_free to fulfill allocations by
 * first fit.  However, if we know a scan_hint at position scan_hint_start
 * cannot fulfill an allocation, we can begin scanning from there knowing
 * the contig_hint will be our fallback.
 *
 * RETURNS:
 * The offset just past the scan_hint when it can be skipped, otherwise
 * block->first_free.
 */
static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
{
	/*
	 * The three conditions below determine if we can skip past the
	 * scan_hint.  First, does the scan hint exist.  Second, is the
	 * contig_hint after the scan_hint (possibly not true iff
	 * contig_hint == scan_hint).  Third, is the allocation request
	 * larger than the scan_hint.
	 */
	if (block->scan_hint &&
	    block->contig_hint_start > block->scan_hint_start &&
	    alloc_bits > block->scan_hint)
		return block->scan_hint_start + block->scan_hint;

	return block->first_free;
}

/**
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper function for pcpu_for_each_md_free_region.  It checks
 * block->contig_hint and performs aggregation across blocks to find the
 * next hint.  It modifies bit_off and bits in-place to be consumed in the
 * loop.
 *
 * When the walk runs off the last block, @bit_off and @bits are left at the
 * values stored by the final iteration (the last block's right_free area).
 */
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
				     int *bits)
{
	/* block holding *bit_off and the position of *bit_off inside it */
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			/* a completely free block keeps the run going */
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
			/* the run ends inside this block: report it */
			return;
		}

		/*
		 * This checks three things.  First is there a contig_hint to
		 * check.  Second, have we checked this hint before by
		 * comparing the block_off.  Third, is this the same as the
		 * right contig hint.  In the last case, it spills over into
		 * the next block and should be handled by the contig area
		 * across blocks code.
		 */
		*bits = block->contig_hint;
		if (*bits && block->contig_hint_start >= block_off &&
		    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
			*bit_off = pcpu_block_off_to_off(i,
					block->contig_hint_start);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;

		/*
		 * Tentatively report this block's right_free area; the next
		 * iteration extends it with the following block's left_free
		 * when it is non-empty.
		 */
		*bits = block->right_free;
		*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
	}
}

/**
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for this
 * allocation.  block->first_free is returned if the allocation request fits
 * within the block to see if the request can be fulfilled prior to the contig
 * hint.
 *
 * When no block can satisfy the request, @bit_off is set to
 * pcpu_chunk_map_bits(@chunk).
 */
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
				 int align, int *bit_off, int *bits)
{
	/* block holding *bit_off and the position of *bit_off inside it */
	int i = pcpu_off_to_block_index(*bit_off);
	int block_off = pcpu_off_to_block_off(*bit_off);
	struct pcpu_block_md *block;

	*bits = 0;
	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
	     block++, i++) {
		/* handles contig area across blocks */
		if (*bits) {
			*bits += block->left_free;
			if (*bits >= alloc_bits)
				return;
			/* still too small, but a fully free block extends it */
			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
				continue;
		}

		/* check block->contig_hint */
		/* *bits temporarily holds the padding needed to align the hint */
		*bits = ALIGN(block->contig_hint_start, align) -
			block->contig_hint_start;
		/*
		 * This uses the block offset to determine if this has been
		 * checked in the prior iteration.
		 */
		if (block->contig_hint &&
		    block->contig_hint_start >= block_off &&
		    block->contig_hint >= *bits + alloc_bits) {
			int start = pcpu_next_hint(block, alloc_bits);

			/*
			 * Report the span from @start to the end of the
			 * allocation placed at the aligned contig hint.
			 */
			*bits += alloc_bits + block->contig_hint_start -
				 start;
			*bit_off = pcpu_block_off_to_off(i, start);
			return;
		}
		/* reset to satisfy the second predicate above */
		block_off = 0;

		/*
		 * Fall back to the right_free area: aligned in-block start,
		 * bits left from there to the end of the block, then the
		 * start converted to a chunk offset.
		 */
		*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
				 align);
		*bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
		*bit_off = pcpu_block_off_to_off(i, *bit_off);
		if (*bits >= alloc_bits)
			return;
	}

	/* no valid offsets were found - fail condition */
	*bit_off = pcpu_chunk_map_bits(chunk);
}

/*
 * Metadata free area iterators.  These perform aggregation of free areas
 * based on the metadata blocks and return the offset @bit_off and size in
 * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
 * a fit is found for the allocation request.
 *
 * @bit_off and @bits must be lvalues: their addresses are passed to the
 * helper, which updates them in place, and every argument is evaluated more
 * than once.  Between iterations the md_free_region iterator advances
 * @bit_off by @bits + 1, the fit_region iterator by @bits.
 */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)		\
	for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));	\
	     (bit_off) < pcpu_chunk_map_bits((chunk));			\
	     (bit_off) += (bits) + 1,					\
	     pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))

#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
	for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
				  &(bits));				      \
	     (bit_off) < pcpu_chunk_map_bits((chunk));			      \
	     (bit_off) += (bits),					      \
	     pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
				  &(bits)))

/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  If @size is no larger than PAGE_SIZE,
 * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
 * This is to facilitate passing through whitelisted flags.  The
 * returned memory is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
*/ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; if (size <= PAGE_SIZE) return kzalloc(size, gfp); else return __vmalloc(size, gfp | __GFP_ZERO); } /** * pcpu_mem_free - free memory * @ptr: memory to free * * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). */ static void pcpu_mem_free(void *ptr) { kvfree(ptr); } static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot, bool move_front) { if (chunk != pcpu_reserved_chunk) { if (move_front) list_move(&chunk->list, &pcpu_chunk_lists[slot]); else list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]); } } static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot) { __pcpu_chunk_move(chunk, slot, true); } /** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on * * This function is called after an allocation or free changed @chunk. * New slot according to the changed state is determined and @chunk is * moved to the slot. Note that the reserved chunk is never put on * chunk slots. * * CONTEXT: * pcpu_lock. 
*/ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); /* leave isolated chunks in-place */ if (chunk->isolated) return; if (oslot != nslot) __pcpu_chunk_move(chunk, nslot, oslot < nslot); } static void pcpu_isolate_chunk(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); if (!chunk->isolated) { chunk->isolated = true; pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages; } list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]); } static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); if (chunk->isolated) { chunk->isolated = false; pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages; pcpu_chunk_relocate(chunk, -1); } } /* * pcpu_update_empty_pages - update empty page counters * @chunk: chunk of interest * @nr: nr of empty pages * * This is used to keep track of the empty pages now based on the premise * a md_block covers a page. The hint update functions recognize if a block * is made full or broken to calculate deltas for keeping track of free pages. */ static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr) { chunk->nr_empty_pop_pages += nr; if (chunk != pcpu_reserved_chunk && !chunk->isolated) pcpu_nr_empty_pop_pages += nr; } /* * pcpu_region_overlap - determines if two regions overlap * @a: start of first region, inclusive * @b: end of first region, exclusive * @x: start of second region, inclusive * @y: end of second region, exclusive * * This is used to determine if the hint region [a, b) overlaps with the * allocated region [x, y). */ static inline bool pcpu_region_overlap(int a, int b, int x, int y) { return (a < y) && (x < b); } /** * pcpu_block_update - updates a block given a free area * @block: block of interest * @start: start offset in block * @end: end offset in block * * Updates a block given a known free area. The region [start, end) is * expected to be the entirety of the free area within a block. 
Chooses
 * the best starting offset if the contig hints are equal.
 *
 * Fields of @block that may be written: first_free, left_free, right_free,
 * contig_hint(_start) and scan_hint(_start).
 */
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
	int contig = end - start;	/* length of the free region */

	block->first_free = min(block->first_free, start);
	/* a region touching the start of the block is its left free run */
	if (start == 0)
		block->left_free = contig;

	/* a region reaching the block's last bit is its right free run */
	if (end == block->nr_bits)
		block->right_free = contig;

	if (contig > block->contig_hint) {
		/* promote the old contig_hint to be the new scan_hint */
		if (start > block->contig_hint_start) {
			if (block->contig_hint > block->scan_hint) {
				block->scan_hint_start =
					block->contig_hint_start;
				block->scan_hint = block->contig_hint;
			} else if (start < block->scan_hint_start) {
				/*
				 * The old contig_hint == scan_hint.  But, the
				 * new contig is larger so hold the invariant
				 * scan_hint_start < contig_hint_start.
				 */
				block->scan_hint = 0;
			}
		} else {
			/* new contig_hint is not after the old one: drop the scan_hint */
			block->scan_hint = 0;
		}
		block->contig_hint_start = start;
		block->contig_hint = contig;
	} else if (contig == block->contig_hint) {
		/*
		 * Same size as the current contig_hint: prefer the better
		 * aligned start (offset 0, or more trailing zero bits).
		 */
		if (block->contig_hint_start &&
		    (!start ||
		     __ffs(start) > __ffs(block->contig_hint_start))) {
			/* start has a better alignment so use it */
			block->contig_hint_start = start;
			if (start < block->scan_hint_start &&
			    block->contig_hint > block->scan_hint)
				block->scan_hint = 0;
		} else if (start > block->scan_hint_start ||
			   block->contig_hint > block->scan_hint) {
			/*
			 * Knowing contig == contig_hint, update the scan_hint
			 * if it is farther than or larger than the current
			 * scan_hint.
			 */
			block->scan_hint_start = start;
			block->scan_hint = contig;
		}
	} else {
		/*
		 * The region is smaller than the contig_hint.  So only update
		 * the scan_hint if it is larger than or equal and farther than
		 * the current scan_hint.
		 */
		if ((start < block->contig_hint_start &&
		     (contig > block->scan_hint ||
		      (contig == block->scan_hint &&
		       start > block->scan_hint_start)))) {
			block->scan_hint_start = start;
			block->scan_hint = contig;
		}
	}
}

/*
 * pcpu_block_update_scan - update a block given a free area from a scan
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finding the final allocation spot first goes through pcpu_find_block_fit()
 * to find a block that can hold the allocation and then pcpu_alloc_area()
 * where a scan is used.  When allocations require specific alignments,
 * we can inadvertently create holes which will not be seen in the alloc
 * or free paths.
 *
 * This takes a given free area hole and updates a block as it may change the
 * scan_hint.  We need to scan backwards to ensure we don't miss free bits
 * from alignment.
 *
 * Nothing is done when the area would extend past the end of its block.
 */
static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
				   int bits)
{
	int s_off = pcpu_off_to_block_off(bit_off);
	int e_off = s_off + bits;
	int s_index, l_bit;
	struct pcpu_block_md *block;

	/* only areas contained in a single block are handled */
	if (e_off > PCPU_BITMAP_BLOCK_BITS)
		return;

	s_index = pcpu_off_to_block_index(bit_off);
	block = chunk->md_blocks + s_index;

	/* scan backwards in case of alignment skipping free bits */
	l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
	/*
	 * l_bit == s_off means no allocated bit was found below s_off
	 * (find_last_bit() then returns its size argument), so the free run
	 * starts at bit 0; otherwise it starts right after the last
	 * allocated bit.
	 */
	s_off = (s_off == l_bit) ? 0 : l_bit + 1;

	pcpu_block_update(block, s_off, e_off);
}

/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 * @full_scan: if we should scan from the beginning
 *
 * Iterates over the metadata blocks to find the largest contig area.
 * A full scan can be avoided on the allocation path as this is triggered
 * if we broke the contig_hint.  In doing so, the scan_hint will be before
 * the contig_hint or after if the scan_hint == contig_hint.  This cannot
 * be prevented on freeing as we want to find the largest area possibly
 * spanning blocks.
*/
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits;

	/* promote scan_hint to contig_hint */
	if (!full_scan && chunk_md->scan_hint) {
		/* resume scanning right after the promoted region */
		bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
		chunk_md->contig_hint_start = chunk_md->scan_hint_start;
		chunk_md->contig_hint = chunk_md->scan_hint;
		chunk_md->scan_hint = 0;
	} else {
		/* rebuild from scratch starting at the first free bit */
		bit_off = chunk_md->first_free;
		chunk_md->contig_hint = 0;
	}

	bits = 0;
	/* feed every free region found from bit_off on into the chunk hints */
	pcpu_for_each_md_free_region(chunk, bit_off, bits)
		pcpu_block_update(chunk_md, bit_off, bit_off + bits);
}

/**
 * pcpu_block_refresh_hint
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * Scans over the block beginning at first_free and updates the block
 * metadata accordingly.  If the block has a scan_hint, it is promoted to
 * contig_hint and the scan starts right after it instead.
 */
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
	struct pcpu_block_md *block = chunk->md_blocks + index;
	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
	unsigned int start, end;	/* region start, region end */

	/* promote scan_hint to contig_hint */
	if (block->scan_hint) {
		start = block->scan_hint_start + block->scan_hint;
		block->contig_hint_start = block->scan_hint_start;
		block->contig_hint = block->scan_hint;
		block->scan_hint = 0;
	} else {
		start = block->first_free;
		block->contig_hint = 0;
	}

	/* recomputed below if a free region reaches the end of the block */
	block->right_free = 0;

	/* iterate over free areas and update the contig hints */
	for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
		pcpu_block_update(block, start, end);
}

/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
 * scans are required if the block's contig hint is broken.
*/
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
					 int bits)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int nr_empty_pages = 0;
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the new allocation */
	int s_off, e_off;	/* block offsets of the new allocation */

	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
	s_index = pcpu_off_to_block_index(bit_off);
	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
	s_off = pcpu_off_to_block_off(bit_off);
	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

	s_block = chunk->md_blocks + s_index;
	e_block = chunk->md_blocks + e_index;

	/*
	 * Update s_block.
	 */
	/* a block that was entirely free stops being an empty page */
	if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
		nr_empty_pages++;

	/*
	 * block->first_free must be updated if the allocation takes its place.
	 * If the allocation breaks the contig_hint, a scan is required to
	 * restore this hint.
	 */
	if (s_off == s_block->first_free)
		s_block->first_free = find_next_zero_bit(
					pcpu_index_alloc_map(chunk, s_index),
					PCPU_BITMAP_BLOCK_BITS,
					s_off + bits);

	/* the allocation landed inside the scan_hint region - drop it */
	if (pcpu_region_overlap(s_block->scan_hint_start,
				s_block->scan_hint_start + s_block->scan_hint,
				s_off,
				s_off + bits))
		s_block->scan_hint = 0;

	if (pcpu_region_overlap(s_block->contig_hint_start,
				s_block->contig_hint_start +
				s_block->contig_hint,
				s_off,
				s_off + bits)) {
		/* block contig hint is broken - scan to fix it */
		if (!s_off)
			s_block->left_free = 0;
		pcpu_block_refresh_hint(chunk, s_index);
	} else {
		/* update left and right contig manually */
		s_block->left_free = min(s_block->left_free, s_off);
		if (s_index == e_index)
			s_block->right_free = min_t(int, s_block->right_free,
					PCPU_BITMAP_BLOCK_BITS - e_off);
		else
			s_block->right_free = 0;
	}

	/*
	 * Update e_block.
	 */
	if (s_index != e_index) {
		if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
			nr_empty_pages++;

		/*
		 * When the allocation is across blocks, the end is along
		 * the left part of the e_block.
		 */
		e_block->first_free = find_next_zero_bit(
				pcpu_index_alloc_map(chunk, e_index),
				PCPU_BITMAP_BLOCK_BITS, e_off);

		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
			/*
			 * The allocation fills e_block completely: advance
			 * the pointer so the in-between loop below (which
			 * stops before e_block) also resets this block.
			 */
			e_block++;
		} else {
			if (e_off > e_block->scan_hint_start)
				e_block->scan_hint = 0;

			e_block->left_free = 0;
			if (e_off > e_block->contig_hint_start) {
				/* contig hint is broken - scan to fix it */
				pcpu_block_refresh_hint(chunk, e_index);
			} else {
				e_block->right_free =
					min_t(int, e_block->right_free,
					      PCPU_BITMAP_BLOCK_BITS - e_off);
			}
		}

		/* update in-between md_blocks */
		nr_empty_pages += (e_index - s_index - 1);
		for (block = s_block + 1; block < e_block; block++) {
			block->scan_hint = 0;
			block->contig_hint = 0;
			block->left_free = 0;
			block->right_free = 0;
		}
	}

	/*
	 * If the allocation is not atomic, some blocks may not be
	 * populated with pages, while we account it here.  The number
	 * of pages will be added back with pcpu_chunk_populated()
	 * when populating pages.
	 */
	if (nr_empty_pages)
		pcpu_update_empty_pages(chunk, -nr_empty_pages);

	/* same overlap test as above, now on the chunk-wide scan_hint */
	if (pcpu_region_overlap(chunk_md->scan_hint_start,
				chunk_md->scan_hint_start +
				chunk_md->scan_hint,
				bit_off,
				bit_off + bits))
		chunk_md->scan_hint = 0;

	/*
	 * The only time a full chunk scan is required is if the chunk
	 * contig hint is broken.  Otherwise, it means a smaller space
	 * was used and therefore the chunk contig hint is still correct.
	 */
	if (pcpu_region_overlap(chunk_md->contig_hint_start,
				chunk_md->contig_hint_start +
				chunk_md->contig_hint,
				bit_off,
				bit_off + bits))
		pcpu_chunk_refresh_hint(chunk, false);
}

/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block
 * refresh by making use of the block contig hints.
If this fails, it scans
 * forward and backward to determine the extent of the free area.  This is
 * capped at the boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk_md->contig_hint.
 * chunk_md->contig_hint may be off by up to a page, but it will never be more
 * than the available space.  If the contig hint is contained in one block, it
 * will be accurate.
 */
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
					int bits)
{
	int nr_empty_pages = 0;
	struct pcpu_block_md *s_block, *e_block, *block;
	int s_index, e_index;	/* block indexes of the freed allocation */
	int s_off, e_off;	/* block offsets of the freed allocation */
	int start, end;		/* start and end of the whole free area */

	/*
	 * Calculate per block offsets.
	 * The calculation uses an inclusive range, but the resulting offsets
	 * are [start, end).  e_index always points to the last block in the
	 * range.
	 */
	s_index = pcpu_off_to_block_index(bit_off);
	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
	s_off = pcpu_off_to_block_off(bit_off);
	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

	s_block = chunk->md_blocks + s_index;
	e_block = chunk->md_blocks + e_index;

	/*
	 * Check if the freed area aligns with the block->contig_hint.
	 * If it does, then the scan to find the beginning/end of the
	 * larger free area can be avoided.
	 *
	 * start and end refer to beginning and end of the free area
	 * within each their respective blocks.  This is not necessarily
	 * the entire free area as it may span blocks past the beginning
	 * or end of the block.
	 */
	start = s_off;
	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
		/* freed area directly follows the contig_hint: merge left */
		start = s_block->contig_hint_start;
	} else {
		/*
		 * Scan backwards to find the extent of the free area.
		 * find_last_bit returns the starting bit, so if the start bit
		 * is returned, that means there was no last bit and the
		 * remainder of the chunk is free.
		 */
		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
					  start);
		start = (start == l_bit) ? 0 : l_bit + 1;
	}

	end = e_off;
	if (e_off == e_block->contig_hint_start)
		/* freed area directly precedes the contig_hint: merge right */
		end = e_block->contig_hint_start + e_block->contig_hint;
	else
		end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
				    PCPU_BITMAP_BLOCK_BITS, end);

	/* update s_block */
	e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
	if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
		nr_empty_pages++;
	pcpu_block_update(s_block, start, e_off);

	/* the free spans more than one block */
	if (s_index != e_index) {
		/* update e_block */
		if (end == PCPU_BITMAP_BLOCK_BITS)
			nr_empty_pages++;
		pcpu_block_update(e_block, 0, end);

		/* reset md_blocks in the middle */
		nr_empty_pages += (e_index - s_index - 1);
		for (block = s_block + 1; block < e_block; block++) {
			block->first_free = 0;
			block->scan_hint = 0;
			block->contig_hint_start = 0;
			block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
			block->left_free = PCPU_BITMAP_BLOCK_BITS;
			block->right_free = PCPU_BITMAP_BLOCK_BITS;
		}
	}

	if (nr_empty_pages)
		pcpu_update_empty_pages(chunk, nr_empty_pages);

	/*
	 * Refresh chunk metadata when the free makes a block free or spans
	 * across blocks.  The contig_hint may be off by up to a page, but if
	 * the contig_hint is contained in a block, it will be accurate with
	 * the else condition below.
	 */
	if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
		pcpu_chunk_refresh_hint(chunk, true);
	else
		/*
		 * chunk_md is indexed in chunk-relative bits, so both ends of
		 * the region must be converted from block offsets.  Here
		 * s_index == e_index, so @end lives in the same block as
		 * @start.  Passing the raw block offset as the end made the
		 * computed length non-positive for every block but the first,
		 * so the chunk hints were never grown by such frees.
		 */
		pcpu_block_update(&chunk->chunk_md,
				  pcpu_block_off_to_off(s_index, start),
				  pcpu_block_off_to_off(s_index, end));
}

/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
*
 * RETURNS:
 * true if every page backing [@bit_off, @bit_off + @bits) is populated.
 * Otherwise false, with *@next_off set to the offset (in allocation units)
 * at which pcpu_find_block_fit() should resume so that the unpopulated
 * pages are skipped.  *@next_off is not written when true is returned.
 */
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
			      int *next_off)
{
	unsigned int start, end;

	/* page range covering the area, rounded outwards */
	start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
	end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);

	/* first unpopulated page in the range, if any */
	start = find_next_zero_bit(chunk->populated, end, start);
	if (start >= end)
		return true;

	/* next populated page after the hole, capped at the range end */
	end = find_next_bit(chunk->populated, end, start + 1);

	*next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
	return false;
}

/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Given a chunk and an allocation spec, find the offset to begin searching
 * for a free region.  This iterates over the bitmap metadata blocks to
 * find an offset that will be guaranteed to fit the requirements.  It is
 * not quite first fit as if the allocation does not fit in the contig hint
 * of a block or chunk, it is skipped.  This errs on the side of caution
 * to prevent excess iteration.  Poor alignment can cause the allocator to
 * skip over blocks and chunks that have valid free areas.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
			       size_t align, bool pop_only)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits, next_off;

	/*
	 * This is an optimization to prevent scanning by assuming if the
	 * allocation cannot fit in the global hint, there is memory pressure
	 * and creating a new chunk would happen soon.
	 */
	if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
		return -1;

	bit_off = pcpu_next_hint(chunk_md, alloc_bits);
	bits = 0;
	pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
		/* accept the first candidate that is (if required) populated */
		if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
						   &next_off))
			break;

		/* skip past the unpopulated pages and restart the search */
		bit_off = next_off;
		bits = 0;
	}

	/* the iterator ran off the end of the chunk without a fit */
	if (bit_off == pcpu_chunk_map_bits(chunk))
		return -1;

	return bit_off;
}

/*
 * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bitnumber to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for zero area
 * @largest_off: offset of the largest area skipped
 * @largest_bits: size of the largest area skipped
 *
 * The @align_mask should be one less than a power of 2.
 *
 * This is a modified version of bitmap_find_next_zero_area_off() to remember
 * the largest area that was skipped.  This is imperfect, but in general is
 * good enough.  The largest remembered region is the largest failed region
 * seen.  This does not include anything we possibly skipped due to alignment.
 * pcpu_block_update_scan() does scan backwards to try and recover what was
 * lost to alignment.  While this can cause scanning to miss earlier possible
 * free areas, smaller allocations will eventually fill those holes.
*/
static unsigned long pcpu_find_zero_area(unsigned long *map,
					 unsigned long size,
					 unsigned long start,
					 unsigned long nr,
					 unsigned long align_mask,
					 unsigned long *largest_off,
					 unsigned long *largest_bits)
{
	unsigned long index, end, i, area_off, area_bits;
again:
	index = find_next_zero_bit(map, size, start);

	/* Align allocation */
	index = __ALIGN_MASK(index, align_mask);
	area_off = index;

	end = index + nr;
	/* a return value > size tells the caller that nothing fits */
	if (end > size)
		return end;
	i = find_next_bit(map, end, index);
	if (i < end) {
		/* a set bit interrupts the candidate: it is too small */
		area_bits = i - area_off;
		/* remember largest unused area with best alignment */
		if (area_bits > *largest_bits ||
		    (area_bits == *largest_bits && *largest_off &&
		     (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
			*largest_off = area_off;
			*largest_bits = area_bits;
		}

		start = i + 1;
		goto again;
	}
	return index;
}

/**
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 *
 * This function takes in a @start offset to begin searching to fit an
 * allocation of @alloc_bits with alignment @align.  It needs to scan
 * the allocation map because if it fits within the block's contig hint,
 * @start will be block->first_free. This is an attempt to fill the
 * allocation prior to breaking the contig hint.  The allocation and
 * boundary maps are updated accordingly if it confirms a valid
 * free area.
 *
 * Must be called with pcpu_lock held.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
			   size_t align, int start)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	size_t align_mask = (align) ? (align - 1) : 0;
	unsigned long area_off = 0, area_bits = 0;
	int bit_off, end, oslot;

	lockdep_assert_held(&pcpu_lock);

	/* remember the current slot so the chunk can be relocated afterwards */
	oslot = pcpu_chunk_slot(chunk);

	/*
	 * Search to find a fit.
	 */
	end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
		    pcpu_chunk_map_bits(chunk));
	bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
				      align_mask, &area_off, &area_bits);
	if (bit_off >= end)
		return -1;

	/* record the largest hole the search skipped over, if any */
	if (area_bits)
		pcpu_block_update_scan(chunk, area_off, area_bits);

	/* update alloc map */
	bitmap_set(chunk->alloc_map, bit_off, alloc_bits);

	/* update boundary map */
	set_bit(bit_off, chunk->bound_map);
	bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
	set_bit(bit_off + alloc_bits, chunk->bound_map);

	chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;

	/* update first free bit */
	if (bit_off == chunk_md->first_free)
		chunk_md->first_free = find_next_zero_bit(
					chunk->alloc_map,
					pcpu_chunk_map_bits(chunk),
					bit_off + alloc_bits);

	pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);

	pcpu_chunk_relocate(chunk, oslot);

	/* convert from allocation units back to a byte offset */
	return bit_off * PCPU_MIN_ALLOC_SIZE;
}

/**
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 *
 * This function determines the size of an allocation to free using
 * the boundary bitmap and clears the allocation map.
 *
 * RETURNS:
 * Number of freed bytes.
*/
static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
	int bit_off, bits, end, oslot, freed;

	lockdep_assert_held(&pcpu_lock);
	pcpu_stats_area_dealloc(chunk);

	/* remember the current slot so the chunk can be relocated afterwards */
	oslot = pcpu_chunk_slot(chunk);

	bit_off = off / PCPU_MIN_ALLOC_SIZE;

	/* find end index */
	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
			    bit_off + 1);
	bits = end - bit_off;
	bitmap_clear(chunk->alloc_map, bit_off, bits);

	freed = bits * PCPU_MIN_ALLOC_SIZE;

	/* update metadata */
	chunk->free_bytes += freed;

	/* update first free bit */
	chunk_md->first_free = min(chunk_md->first_free, bit_off);

	pcpu_block_update_hint_free(chunk, bit_off, bits);

	pcpu_chunk_relocate(chunk, oslot);

	return freed;
}

/*
 * Initialize @block to describe @nr_bits of entirely free space: the contig
 * hint and both edge runs cover the whole block and there is no scan hint.
 * contig_hint_start and scan_hint_start are not written here.
 * NOTE(review): presumably the callers rely on zeroed backing memory for
 * those two fields - confirm.
 */
static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
{
	block->scan_hint = 0;
	block->contig_hint = nr_bits;
	block->left_free = nr_bits;
	block->right_free = nr_bits;
	block->first_free = 0;
	block->nr_bits = nr_bits;
}

/*
 * Initialize the chunk-wide metadata block (sized to the whole allocation
 * map) and every per-block metadata entry (PCPU_BITMAP_BLOCK_BITS each) of
 * @chunk as fully free.
 */
static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
{
	struct pcpu_block_md *md_block;

	/* init the chunk's block */
	pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));

	for (md_block = chunk->md_blocks;
	     md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
	     md_block++)
		pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
}

/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.  The
 * base_addr is page aligned down of @tmp_addr while the region end is page
 * aligned up.  Offsets are kept track of to determine the region served. All
 * this is done to appease the bitmap allocator in avoiding partial blocks.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
*/
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
							 int map_size)
{
	struct pcpu_chunk *chunk;
	unsigned long aligned_addr;
	int start_offset, offset_bits, region_size, region_bits;
	size_t alloc_size;

	/* region calculations */
	aligned_addr = tmp_addr & PAGE_MASK;

	start_offset = tmp_addr - aligned_addr;
	region_size = ALIGN(start_offset + map_size, PAGE_SIZE);

	/* allocate chunk */
	alloc_size = struct_size(chunk, populated,
				 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
	chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
	if (!chunk)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      alloc_size);

	INIT_LIST_HEAD(&chunk->list);

	chunk->base_addr = (void *)aligned_addr;
	chunk->start_offset = start_offset;
	/* padding between the end of the served area and the page-aligned end */
	chunk->end_offset = region_size - chunk->start_offset - map_size;

	chunk->nr_pages = region_size >> PAGE_SHIFT;
	region_bits = pcpu_chunk_map_bits(chunk);

	alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
	chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
	if (!chunk->alloc_map)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      alloc_size);

	/* one extra bit: the boundary map also marks the end of the last area */
	alloc_size =
		BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
	chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
	if (!chunk->bound_map)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      alloc_size);

	alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
	chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
	if (!chunk->md_blocks)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      alloc_size);

#ifdef NEED_PCPUOBJ_EXT
	/* first chunk is free to use */
	chunk->obj_exts = NULL;
#endif
	pcpu_init_md_blocks(chunk);

	/* manage populated page bitmap */
	chunk->immutable = true;
	bitmap_fill(chunk->populated, chunk->nr_pages);
	chunk->nr_populated = chunk->nr_pages;
	chunk->nr_empty_pop_pages = chunk->nr_pages;

	chunk->free_bytes = map_size;

	if (chunk->start_offset) {
		/* hide the beginning of the bitmap */
		offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
		bitmap_set(chunk->alloc_map, 0, offset_bits);
		set_bit(0, chunk->bound_map);
		set_bit(offset_bits, chunk->bound_map);

		chunk->chunk_md.first_free = offset_bits;

		pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
	}

	if (chunk->end_offset) {
		/* hide the end of the bitmap */
		offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
		bitmap_set(chunk->alloc_map,
			   pcpu_chunk_map_bits(chunk) - offset_bits,
			   offset_bits);
		set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
			chunk->bound_map);
		set_bit(region_bits, chunk->bound_map);

		pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
					     - offset_bits, offset_bits);
	}

	return chunk;
}

/*
 * Allocate a zeroed chunk descriptor together with its allocation map,
 * boundary map, metadata blocks and (when need_pcpuobj_ext() says so) the
 * per-object extension array, then mark the whole chunk as free.
 *
 * Returns the new chunk, or NULL if any allocation fails; everything
 * allocated up to that point is released again.
 */
static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
{
	struct pcpu_chunk *chunk;
	int region_bits;

	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
	if (!chunk)
		return NULL;

	INIT_LIST_HEAD(&chunk->list);
	chunk->nr_pages = pcpu_unit_pages;
	region_bits = pcpu_chunk_map_bits(chunk);

	chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
					   sizeof(chunk->alloc_map[0]), gfp);
	if (!chunk->alloc_map)
		goto alloc_map_fail;

	/* one extra bit: the boundary map also marks the end of the last area */
	chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
					   sizeof(chunk->bound_map[0]), gfp);
	if (!chunk->bound_map)
		goto bound_map_fail;

	chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
					   sizeof(chunk->md_blocks[0]), gfp);
	if (!chunk->md_blocks)
		goto md_blocks_fail;

#ifdef NEED_PCPUOBJ_EXT
	if (need_pcpuobj_ext()) {
		chunk->obj_exts =
			pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
					sizeof(struct pcpuobj_ext), gfp);
		if (!chunk->obj_exts)
			goto objcg_fail;
	}
#endif

	pcpu_init_md_blocks(chunk);

	/* init metadata */
	chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;

	return chunk;

	/* unwind in reverse order of allocation */
#ifdef NEED_PCPUOBJ_EXT
objcg_fail:
	pcpu_mem_free(chunk->md_blocks);
#endif
md_blocks_fail:
	pcpu_mem_free(chunk->bound_map);
bound_map_fail:
	pcpu_mem_free(chunk->alloc_map);
alloc_map_fail:
	pcpu_mem_free(chunk);

	return NULL;
}

/*
 * Release a chunk descriptor and all of its bookkeeping arrays.
 * A NULL @chunk is accepted and ignored.
 */
static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
	if (!chunk)
		return;
#ifdef NEED_PCPUOBJ_EXT
	pcpu_mem_free(chunk->obj_exts);
#endif
	pcpu_mem_free(chunk->md_blocks);
	pcpu_mem_free(chunk->bound_map);
	pcpu_mem_free(chunk->alloc_map);
	pcpu_mem_free(chunk);
}

/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population.
 *
 * CONTEXT:
 * pcpu_lock must be held.
 */
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
				 int page_end)
{
	int nr = page_end - page_start;

	lockdep_assert_held(&pcpu_lock);

	bitmap_set(chunk->populated, page_start, nr);
	chunk->nr_populated += nr;
	pcpu_nr_populated += nr;

	pcpu_update_empty_pages(chunk, nr);
}

/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
 * Update the bookkeeping information accordingly.  Must be called after
 * each successful depopulation.
 *
 * CONTEXT:
 * pcpu_lock must be held.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
				   int page_start, int page_end)
{
	int nr = page_end - page_start;

	lockdep_assert_held(&pcpu_lock);

	bitmap_clear(chunk->populated, page_start, nr);
	chunk->nr_populated -= nr;
	pcpu_nr_populated -= nr;

	pcpu_update_empty_pages(chunk, -nr);
}

/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
* * pcpu_populate_chunk - populate the specified range of a chunk * pcpu_depopulate_chunk - depopulate the specified range of a chunk * pcpu_post_unmap_tlb_flush - flush tlb for the specified range of a chunk * pcpu_create_chunk - create a new chunk * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop * pcpu_addr_to_page - translate address to physical address * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end, gfp_t gfp); static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end); static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end); static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); #ifdef CONFIG_NEED_PER_CPU_KM #include "percpu-km.c" #else #include "percpu-vm.c" #endif /** * pcpu_chunk_addr_search - determine chunk containing specified address * @addr: address for which the chunk needs to be determined. * * This is an internal function that handles all but static allocations. * Static percpu address values should never be passed into the allocator. * * RETURNS: * The address of the found chunk. */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { /* is it in the dynamic region (first chunk)? */ if (pcpu_addr_in_chunk(pcpu_first_chunk, addr)) return pcpu_first_chunk; /* is it in the reserved region? */ if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr)) return pcpu_reserved_chunk; /* * The address is relative to unit0 which might be unused and * thus unmapped. Offset the address to the unit space of the * current processor before looking it up in the vmalloc * space. 
Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug. */ addr += pcpu_unit_offsets[raw_smp_processor_id()]; return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); } #ifdef CONFIG_MEMCG static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { struct obj_cgroup *objcg; if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT)) return true; objcg = current_obj_cgroup(); if (!objcg) return true; if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) return false; *objcgp = objcg; return true; } static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, struct pcpu_chunk *chunk, int off, size_t size) { if (!objcg) return; if (likely(chunk && chunk->obj_exts)) { obj_cgroup_get(objcg); chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg; rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, pcpu_obj_full_size(size)); rcu_read_unlock(); } else { obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); } } static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { struct obj_cgroup *objcg; if (unlikely(!chunk->obj_exts)) return; objcg = chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup; if (!objcg) return; chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = NULL; obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, -pcpu_obj_full_size(size)); rcu_read_unlock(); obj_cgroup_put(objcg); } #else /* CONFIG_MEMCG */ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { return true; } static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, struct pcpu_chunk *chunk, int off, size_t size) { } static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { } #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MEM_ALLOC_PROFILING static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, size_t 
size) { if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) { alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, current->alloc_tag, size); } } static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size); } #else static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, size_t size) { } static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { } #endif /** * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available * @gfp: allocation flags * * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN * then no warning will be triggered on invalid or failed allocation * requests. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, gfp_t gfp) { gfp_t pcpu_gfp; bool is_atomic; bool do_warn; struct obj_cgroup *objcg = NULL; static int warn_limit = 10; struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; unsigned long flags; void __percpu *ptr; size_t bits, bit_align; gfp = current_gfp_context(gfp); /* whitelisted flags that can be passed to the backing allocators */ pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; do_warn = !(gfp & __GFP_NOWARN); /* * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, * therefore alignment must be a minimum of that many bytes. * An allocation may have internal fragmentation from rounding up * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes. 
*/ if (unlikely(align < PCPU_MIN_ALLOC_SIZE)) align = PCPU_MIN_ALLOC_SIZE; size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); bits = size >> PCPU_MIN_ALLOC_SHIFT; bit_align = align >> PCPU_MIN_ALLOC_SHIFT; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || !is_power_of_2(align))) { WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n", size, align); return NULL; } if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg))) return NULL; if (!is_atomic) { /* * pcpu_balance_workfn() allocates memory under this mutex, * and it may wait for memory reclaim. Allow current task * to become OOM victim, in case of memory pressure. */ if (gfp & __GFP_NOFAIL) { mutex_lock(&pcpu_alloc_mutex); } else if (mutex_lock_killable(&pcpu_alloc_mutex)) { pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); return NULL; } } spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); if (off < 0) { err = "alloc from reserved chunk failed"; goto fail_unlock; } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; err = "alloc from reserved chunk failed"; goto fail_unlock; } restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) { list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot], list) { off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); if (off < 0) { if (slot < PCPU_SLOT_FAIL_THRESHOLD) pcpu_chunk_move(chunk, 0); continue; } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) { pcpu_reintegrate_chunk(chunk); goto area_found; } } } spin_unlock_irqrestore(&pcpu_lock, flags); if (is_atomic) { err = "atomic alloc failed, no space left"; goto fail; } /* No space left. Create a new chunk. 
*/ if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) { chunk = pcpu_create_chunk(pcpu_gfp); if (!chunk) { err = "failed to allocate new chunk"; goto fail; } spin_lock_irqsave(&pcpu_lock, flags); pcpu_chunk_relocate(chunk, -1); } else { spin_lock_irqsave(&pcpu_lock, flags); } goto restart; area_found: pcpu_stats_area_alloc(chunk, size); spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ if (!is_atomic) { unsigned int page_end, rs, re; rs = PFN_DOWN(off); page_end = PFN_UP(off + size); for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } pcpu_chunk_populated(chunk, rs, re); spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); } if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size, gfp); trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align, chunk->base_addr, off, ptr, pcpu_obj_full_size(size), gfp); pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); pcpu_alloc_tag_alloc_hook(chunk, off, size); return ptr; fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); if (do_warn && warn_limit) { pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", size, align, is_atomic, err); if (!is_atomic) dump_stack(); if (!--warn_limit) pr_info("limit reached, disable warning\n"); } if (is_atomic) { /* see the flag handling in pcpu_balance_workfn() */ pcpu_atomic_alloc_failed = true; pcpu_schedule_balance_work(); } else { 
mutex_unlock(&pcpu_alloc_mutex);
	}

	pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);

	return NULL;
}
EXPORT_SYMBOL_GPL(pcpu_alloc_noprof);

/**
 * pcpu_balance_free - manage the amount of free chunks
 * @empty_only: free chunks only if there are no populated pages
 *
 * Walk the chunks on the free slot and reclaim all of them except the
 * first one on the list.  If empty_only is %false, reclaim all fully free
 * chunks regardless of the number of populated pages.  Otherwise, only
 * reclaim chunks that have no populated pages
 * (chunk->nr_empty_pop_pages == 0).
 *
 * Selected chunks are first moved to a private list while pcpu_lock is
 * held.  The lock is then dropped for the depopulate/destroy work, retaken
 * briefly around each pcpu_chunk_depopulated() call, and retaken once more
 * before returning so the caller sees it held on exit.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_free(bool empty_only)
{
	/* private list of chunks picked for destruction */
	LIST_HEAD(to_free);
	struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
	struct pcpu_chunk *chunk, *next;

	lockdep_assert_held(&pcpu_lock);

	/*
	 * There's no reason to keep around multiple unused chunks and VM
	 * areas can be scarce.  Destroy all free chunks except for one.
	 */
	list_for_each_entry_safe(chunk, next, free_head, list) {
		WARN_ON(chunk->immutable);

		/* spare the first one */
		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
			continue;

		if (!empty_only || chunk->nr_empty_pop_pages == 0)
			list_move(&chunk->list, &to_free);
	}

	/* nothing selected: return with pcpu_lock still held */
	if (list_empty(&to_free))
		return;

	/* the chunks on to_free are off the slot lists; work on them unlocked */
	spin_unlock_irq(&pcpu_lock);
	list_for_each_entry_safe(chunk, next, &to_free, list) {
		unsigned int rs, re;

		/* release every populated page range [rs, re) of the chunk */
		for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
			pcpu_depopulate_chunk(chunk, rs, re);
			/* the bookkeeping update is done under pcpu_lock */
			spin_lock_irq(&pcpu_lock);
			pcpu_chunk_depopulated(chunk, rs, re);
			spin_unlock_irq(&pcpu_lock);
		}
		pcpu_destroy_chunk(chunk);
		cond_resched();
	}
	/* restore the locking state the caller expects */
	spin_lock_irq(&pcpu_lock);
}

/**
 * pcpu_balance_populated - manage the amount of populated pages
 *
 * Maintain a certain amount of populated pages to satisfy atomic allocations.
 * It is possible that this is called when physical memory is scarce causing
 * OOM killer to be triggered.  We should avoid doing so until an actual
 * allocation causes the failure as it is possible that requests can be
 * serviced from already backed regions.
*
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_populated(void)
{
	/* gfp flags passed to underlying allocators */
	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
	struct pcpu_chunk *chunk;
	int slot, nr_to_pop, ret;

	lockdep_assert_held(&pcpu_lock);

	/*
	 * Ensure there are certain number of free populated pages for
	 * atomic allocs.  Fill up from the most packed so that atomic
	 * allocs don't increase fragmentation.  If atomic allocation
	 * failed previously, always populate the maximum amount.  This
	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
	 * failing indefinitely; however, large atomic allocs are not
	 * something we support properly and can be highly unreliable and
	 * inefficient.
	 */
retry_pop:
	if (pcpu_atomic_alloc_failed) {
		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
		/* best effort anyway, don't worry about synchronization */
		pcpu_atomic_alloc_failed = false;
	} else {
		/* top up to the high watermark, never a negative amount */
		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
				  pcpu_nr_empty_pop_pages,
				  0, PCPU_EMPTY_POP_PAGES_HIGH);
	}

	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
		unsigned int nr_unpop = 0, rs, re;

		if (!nr_to_pop)
			break;

		/* find the first chunk in this slot with unpopulated pages */
		list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
			nr_unpop = chunk->nr_pages - chunk->nr_populated;
			if (nr_unpop)
				break;
		}

		/* slot is empty or every chunk in it is fully populated */
		if (!nr_unpop)
			continue;

		/* @chunk can't go away while pcpu_alloc_mutex is held */
		for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
			/* populate at most what is still needed from this hole */
			int nr = min_t(int, re - rs, nr_to_pop);

			/* pcpu_lock is dropped around the populate call */
			spin_unlock_irq(&pcpu_lock);
			ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
			cond_resched();
			spin_lock_irq(&pcpu_lock);
			if (!ret) {
				nr_to_pop -= nr;
				pcpu_chunk_populated(chunk, rs, rs + nr);
			} else {
				/*
				 * Populate failed: zeroing nr_to_pop ends both
				 * loops and skips the new-chunk path below.
				 */
				nr_to_pop = 0;
			}

			if (!nr_to_pop)
				break;
		}
	}

	if (nr_to_pop) {
		/* ran out of chunks to populate, create a new one and retry */
		spin_unlock_irq(&pcpu_lock);
		chunk = pcpu_create_chunk(gfp);
		cond_resched();
		spin_lock_irq(&pcpu_lock);
		if (chunk) {
			pcpu_chunk_relocate(chunk, -1);
			goto retry_pop;
		}
		/* chunk creation failed: give up silently (best effort) */
	}
}

/**
 * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
 *
 * Scan over chunks in the depopulate list and try to release unused populated
 * pages back to the system.  Depopulated chunks are sidelined to prevent
 * repopulating these pages unless required.  Fully free chunks are reintegrated
 * and freed accordingly (1 is kept around).  If we drop below the empty
 * populated pages threshold, reintegrate the chunk if it has empty free pages.
 * Each chunk is scanned in the reverse order to keep populated pages close to
 * the beginning of the chunk.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_reclaim_populated(void)
{
	struct pcpu_chunk *chunk;
	struct pcpu_block_md *block;
	int freed_page_start, freed_page_end;
	int i, end;
	bool reintegrate;

	lockdep_assert_held(&pcpu_lock);

	/*
	 * Once a chunk is isolated to the to_depopulate list, the chunk is no
	 * longer discoverable to allocations which may populate pages.  The
	 * only other accessor is the free path which only returns area back to
	 * the allocator not touching the populated bitmap.
	 */
	while ((chunk = list_first_entry_or_null(
			&pcpu_chunk_lists[pcpu_to_depopulate_slot],
			struct pcpu_chunk, list))) {
		WARN_ON(chunk->immutable);

		/*
		 * Scan chunk's pages in the reverse order to keep populated
		 * pages close to the beginning of the chunk.
		 *
		 * freed_page_start/end accumulate the union of all ranges
		 * depopulated from this chunk; they start out as an empty
		 * range (start > end) so the flush below is skipped when
		 * nothing was freed.
		 */
		freed_page_start = chunk->nr_pages;
		freed_page_end = 0;
		reintegrate = false;
		/* end == -1 means "no depopulation range currently open" */
		for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
			/* no more work to do */
			if (chunk->nr_empty_pop_pages == 0)
				break;

			/* reintegrate chunk to prevent atomic alloc failures */
			if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
				reintegrate = true;
				break;
			}

			/*
			 * If the page is empty and populated, start or
			 * extend the (i, end) range.  If i == 0, decrease
			 * i and perform the depopulation to cover the last
			 * (first) page in the chunk.
			 */
			block = chunk->md_blocks + i;
			if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
			    test_bit(i, chunk->populated)) {
				if (end == -1)
					end = i;
				if (i > 0)
					continue;
				/* i becomes -1 so that i + 1 below is page 0 */
				i--;
			}

			/* depopulate if there is an active range */
			if (end == -1)
				continue;

			/* pages [i + 1, end + 1) are empty and populated */
			spin_unlock_irq(&pcpu_lock);
			pcpu_depopulate_chunk(chunk, i + 1, end + 1);
			cond_resched();
			spin_lock_irq(&pcpu_lock);

			pcpu_chunk_depopulated(chunk, i + 1, end + 1);
			freed_page_start = min(freed_page_start, i + 1);
			freed_page_end = max(freed_page_end, end + 1);

			/* reset the range and continue */
			end = -1;
		}

		/* batch tlb flush per chunk to amortize cost */
		if (freed_page_start < freed_page_end) {
			spin_unlock_irq(&pcpu_lock);
			pcpu_post_unmap_tlb_flush(chunk,
						  freed_page_start,
						  freed_page_end);
			cond_resched();
			spin_lock_irq(&pcpu_lock);
		}

		/*
		 * Either way the chunk leaves the to_depopulate list, which
		 * is what lets the enclosing while loop make progress.
		 */
		if (reintegrate || chunk->free_bytes == pcpu_unit_size)
			pcpu_reintegrate_chunk(chunk);
		else
			list_move_tail(&chunk->list,
				       &pcpu_chunk_lists[pcpu_sidelined_slot]);
	}
}

/**
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * For each chunk type, manage the number of fully free chunks and the number of
 * populated pages.  An important thing to consider is when pages are freed and
 * how they contribute to the global counts.
 *
 * Runs all four balancing steps under pcpu_alloc_mutex with pcpu_lock taken
 * via spin_lock_irq(); the individual steps may drop and retake pcpu_lock.
 */
static void pcpu_balance_workfn(struct work_struct *work)
{
	/*
	 * pcpu_balance_free() is called twice because the first time we may
	 * trim pages in the active pcpu_nr_empty_pop_pages which may cause us
	 * to grow other chunks.  This then gives pcpu_reclaim_populated() time
	 * to move fully free chunks to the active list to be freed if
	 * appropriate.
	 */
	mutex_lock(&pcpu_alloc_mutex);
	spin_lock_irq(&pcpu_lock);

	pcpu_balance_free(false);
	pcpu_reclaim_populated();
	pcpu_balance_populated();
	pcpu_balance_free(true);

	spin_unlock_irq(&pcpu_lock);
	mutex_unlock(&pcpu_alloc_mutex);
}

/**
 * pcpu_alloc_size - the size of the dynamic percpu area
 * @ptr: pointer to the dynamic percpu area
 *
 * Returns the size of the @ptr allocation.
This is undefined for statically
 * defined percpu variables as there is no corresponding chunk->bound_map.
 *
 * RETURNS:
 * The size of the dynamic percpu area.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
size_t pcpu_alloc_size(void __percpu *ptr)
{
	struct pcpu_chunk *chunk;
	unsigned long bit_off, end;
	void *addr;

	/* a NULL pointer has size 0 */
	if (!ptr)
		return 0;

	addr = __pcpu_ptr_to_addr(ptr);
	/* No pcpu_lock here: ptr has not been freed, so chunk is still alive */
	chunk = pcpu_chunk_addr_search(addr);
	/* offset of the allocation in units of PCPU_MIN_ALLOC_SIZE */
	bit_off = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE;
	/* the allocation extends up to the next set bit in bound_map */
	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
			    bit_off + 1);
	return (end - bit_off) * PCPU_MIN_ALLOC_SIZE;
}

/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.  A NULL @ptr is ignored.  The area is returned to
 * its chunk under pcpu_lock; if that leaves more than one fully free chunk,
 * or makes the chunk eligible for reclaim, the balance work is scheduled
 * after the lock has been released.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
void free_percpu(void __percpu *ptr)
{
	void *addr;
	struct pcpu_chunk *chunk;
	unsigned long flags;
	int size, off;
	bool need_balance = false;

	if (!ptr)
		return;

	kmemleak_free_percpu(ptr);

	/* locate the owning chunk and the byte offset inside it */
	addr = __pcpu_ptr_to_addr(ptr);
	chunk = pcpu_chunk_addr_search(addr);
	off = addr - chunk->base_addr;

	spin_lock_irqsave(&pcpu_lock, flags);
	/* pcpu_free_area() returns the size that the hooks below are given */
	size = pcpu_free_area(chunk, off);

	pcpu_alloc_tag_free_hook(chunk, off, size);

	pcpu_memcg_free_hook(chunk, off, size);

	/*
	 * If there are more than one fully free chunks, wake up grim reaper.
	 * If the chunk is isolated, it may be in the process of being
	 * reclaimed.  Let reclaim manage cleaning up of that chunk.
	 */
	if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
		struct pcpu_chunk *pos;

		/* any other chunk on the free slot means there is a surplus */
		list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list)
			if (pos != chunk) {
				need_balance = true;
				break;
			}
	} else if (pcpu_should_reclaim_chunk(chunk)) {
		pcpu_isolate_chunk(chunk);
		need_balance = true;
	}

	trace_percpu_free_percpu(chunk->base_addr, off, ptr);

	spin_unlock_irqrestore(&pcpu_lock, flags);

	/* scheduling is done only after pcpu_lock has been released */
	if (need_balance)
		pcpu_schedule_balance_work();
}
EXPORT_SYMBOL_GPL(free_percpu);

/**
 * __is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 * @can_addr: if non-NULL, receives a canonical form of @addr on a match
 *
 * On SMP, check @addr against the static percpu region
 * (__per_cpu_end - __per_cpu_start bytes) of every possible CPU.  On a
 * match and with @can_addr set, *@can_addr becomes the offset of @addr
 * within the matching CPU's region added to the start of the region of
 * the CPU returned by get_boot_cpu_id().
 *
 * RETURNS:
 * %true if @addr lies in some possible CPU's static percpu region, %false
 * otherwise.  Without CONFIG_SMP this always returns %false and @can_addr
 * is left untouched.
 */
bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
{
#ifdef CONFIG_SMP
	const size_t static_size = __per_cpu_end - __per_cpu_start;
	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		void *start = per_cpu_ptr(base, cpu);
		void *va = (void *)addr;

		if (va >= start && va < start + static_size) {
			if (can_addr) {
				/* offset within this cpu's static area ... */
				*can_addr = (unsigned long) (va - start);
				/* ... rebased onto the boot cpu's area */
				*can_addr += (unsigned long)
					per_cpu_ptr(base, get_boot_cpu_id());
			}
			return true;
		}
	}
#endif
	/* on UP, can't distinguish from other static vars, always false */
	return false;
}

/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Test whether @addr belongs to in-kernel static percpu area.  Module
 * static percpu areas are not considered.  For those, use
 * is_module_percpu_address().
 *
 * This is __is_kernel_percpu_address() without a canonical-address output.
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
bool is_kernel_percpu_address(unsigned long addr)
{
	return __is_kernel_percpu_address(addr, NULL);
}

/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
 * Given @addr which is dereferenceable address obtained via one of
 * percpu access macros, this function translates it into its physical
 * address.  The caller is responsible for ensuring @addr stays valid
 * until this function finishes.
*
 * percpu allocator has special setup for the first chunk, which currently
 * supports either embedding in linear address space or vmalloc mapping,
 * and, from the second one, the backing allocator (currently either vm or
 * km) provides translation.
 *
 * The addr can be translated simply without checking if it falls into the
 * first chunk. But the current code reflects better how percpu allocator
 * actually works, and the verification can discover both bugs in percpu
 * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
 * code.
 *
 * RETURNS:
 * The physical address for @addr.
 */
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
	const unsigned long target = (unsigned long)addr;
	const unsigned long chunk_base = (unsigned long)pcpu_base_addr;
	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
	unsigned long span_lo, span_hi;
	bool hit = false;
	unsigned int cpu;

	/*
	 * Cheap pre-filter: [span_lo, span_hi) runs from the start of the
	 * lowest unit to the end of the highest unit of the first chunk.
	 * It is not strictly required, but it lets addresses that cannot
	 * be in the first chunk skip the per-cpu scan.
	 *
	 * Full unit sizes are used here.  pcpu_base_addr is the start of
	 * the first chunk including the static region, and the first chunk
	 * may in fact be smaller than pcpu_unit_pages, so this assumes the
	 * caller passes a sane address.
	 */
	span_lo = chunk_base + pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
	span_hi = chunk_base +
		  pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);

	if (span_lo <= target && target < span_hi) {
		for_each_possible_cpu(cpu) {
			void *unit = per_cpu_ptr(base, cpu);

			/* outside this cpu's unit, try the next one */
			if (addr < unit || addr >= unit + pcpu_unit_size)
				continue;

			hit = true;
			break;
		}
	}

	/* not in the first chunk: the backing allocator knows the page */
	if (!hit)
		return page_to_phys(pcpu_addr_to_page(addr)) +
		       offset_in_page(addr);

	/* first chunk mapped through vmalloc space */
	if (is_vmalloc_addr(addr))
		return page_to_phys(vmalloc_to_page(addr)) +
		       offset_in_page(addr);

	/* first chunk embedded in the linear mapping */
	return __pa(addr);
}

/**
 * pcpu_alloc_alloc_info - allocate percpu allocation info
 * @nr_groups: the number of groups
 * @nr_units: the number of units
 *
 * Allocate an ai with room for @nr_groups group descriptors followed by a
 * cpu_map array of @nr_units entries.  groups[0].cpu_map is pointed at
 * that array and every entry is preset to NR_CPUS.  The cpu_map pointers
 * of the remaining groups are left for the caller to initialize.
 *
 * RETURNS:
 * Pointer to the allocated pcpu_alloc_info on success, NULL on
 * failure.
 */
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
						      int nr_units)
{
	struct pcpu_alloc_info *ai;
	size_t hdr_bytes, total_bytes;
	void *mem;
	int idx = 0;

	/* header (struct + groups[]) padded so the cpu_map array is aligned */
	hdr_bytes = ALIGN(struct_size(ai, groups, nr_groups),
			  __alignof__(ai->groups[0].cpu_map[0]));
	total_bytes = hdr_bytes + nr_units * sizeof(ai->groups[0].cpu_map[0]);

	mem = memblock_alloc(PFN_ALIGN(total_bytes), PAGE_SIZE);
	if (!mem)
		return NULL;

	ai = mem;
	/* the shared cpu_map array sits right behind the padded header */
	ai->groups[0].cpu_map = mem + hdr_bytes;

	/* NR_CPUS marks a unit that has no cpu assigned */
	while (idx < nr_units) {
		ai->groups[0].cpu_map[idx] = NR_CPUS;
		idx++;
	}

	ai->nr_groups = nr_groups;
	/* remembered so the ai can be handed back with the same size */
	ai->__ai_size = PFN_ALIGN(total_bytes);

	return ai;
}

/**
 * pcpu_free_alloc_info - free percpu allocation info
 * @ai: pcpu_alloc_info to free
 *
 * Free @ai which was allocated by pcpu_alloc_alloc_info().
*/ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { memblock_free(ai, ai->__ai_size); } /** * pcpu_dump_alloc_info - print out information about pcpu_alloc_info * @lvl: loglevel * @ai: allocation info to dump * * Print out information about @ai using loglevel @lvl. */ static void pcpu_dump_alloc_info(const char *lvl, const struct pcpu_alloc_info *ai) { int group_width = 1, cpu_width = 1, width; char empty_str[] = "--------"; int alloc = 0, alloc_end = 0; int group, v; int upa, apl; /* units per alloc, allocs per line */ v = ai->nr_groups; while (v /= 10) group_width++; v = num_possible_cpus(); while (v /= 10) cpu_width++; empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; upa = ai->alloc_size / ai->unit_size; width = upa * (cpu_width + 1) + group_width + 3; apl = rounddown_pow_of_two(max(60 / width, 1)); printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", lvl, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); for (group = 0; group < ai->nr_groups; group++) { const struct pcpu_group_info *gi = &ai->groups[group]; int unit = 0, unit_end = 0; BUG_ON(gi->nr_units % upa); for (alloc_end += gi->nr_units / upa; alloc < alloc_end; alloc++) { if (!(alloc % apl)) { pr_cont("\n"); printk("%spcpu-alloc: ", lvl); } pr_cont("[%0*d] ", group_width, group); for (unit_end += upa; unit < unit_end; unit++) if (gi->cpu_map[unit] != NR_CPUS) pr_cont("%0*d ", cpu_width, gi->cpu_map[unit]); else pr_cont("%s ", empty_str); } } pr_cont("\n"); } /** * pcpu_setup_first_chunk - initialize the first percpu chunk * @ai: pcpu_alloc_info describing how to percpu area is shaped * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * percpu area. This function is to be called from arch percpu area * setup path. * * @ai contains all information necessary to initialize the first * chunk and prime the dynamic percpu allocator. 
*
 * @ai->static_size is the size of static percpu area.
 *
 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
 * reserve after the static area in the first chunk.  This reserves
 * the first chunk such that it's available only through reserved
 * percpu allocation.  This is primarily used to serve module percpu
 * static areas on architectures where the addressing model has
 * limited offset range for symbol relocations to guarantee module
 * percpu symbols fall inside the relocatable range.
 *
 * @ai->dyn_size determines the number of bytes available for dynamic
 * allocation in the first chunk.  The area between @ai->static_size +
 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
 *
 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
 * and equal to or larger than @ai->static_size + @ai->reserved_size +
 * @ai->dyn_size.
 *
 * @ai->atom_size is the allocation atom size and used as alignment
 * for vm areas.
 *
 * @ai->alloc_size is the allocation size and always multiple of
 * @ai->atom_size.  This is larger than @ai->atom_size if
 * @ai->unit_size is larger than @ai->atom_size.
 *
 * @ai->nr_groups and @ai->groups describe virtual memory layout of
 * percpu areas.  Units which should be colocated are put into the
 * same group.  Dynamic VM areas will be allocated according to these
 * groupings.  If @ai->nr_groups is zero, a single group containing
 * all units is assumed.
 *
 * The caller should have mapped the first chunk at @base_addr and
 * copied static data to each unit.
 *
 * The first chunk will always contain a static and a dynamic region.
 * However, the static region is not managed by any chunk.  If the first
 * chunk also contains a reserved region, it is served by two chunks -
 * one for the reserved region and one for the dynamic region.  They
 * share the same vm, but use offset regions in the area allocation map.
 * The chunk serving the dynamic region is circulated in the chunk slots
 * and available for dynamic allocation like any other chunk.
 */
void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
				   void *base_addr)
{
	size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
	size_t static_size, dyn_size;
	unsigned long *group_offsets;
	size_t *group_sizes;
	unsigned long *unit_off;
	unsigned int cpu;
	int *unit_map;
	int group, unit, i;
	unsigned long tmp_addr;
	size_t alloc_size;

/*
 * Function-local assertion: on failure log the failed condition, the
 * possible cpu mask and the full alloc info at emergency level, then BUG().
 * It is #undef'd again once the input has been parsed.
 */
#define PCPU_SETUP_BUG_ON(cond)	do {					\
	if (unlikely(cond)) {						\
		pr_emerg("failed to initialize, %s\n", #cond);		\
		pr_emerg("cpu_possible_mask=%*pb\n",			\
			 cpumask_pr_args(cpu_possible_mask));		\
		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
		BUG();							\
	}								\
} while (0)

	/* sanity checks */
	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
	PCPU_SETUP_BUG_ON(!ai->static_size);
	PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
#endif
	PCPU_SETUP_BUG_ON(!base_addr);
	PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
	PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
	/* one of block size and page size must be a multiple of the other */
	PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
			    IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

	/* process group information and build config tables accordingly */
	alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
	group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
	if (!group_offsets)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      alloc_size);

	alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
	group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
	if (!group_sizes)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      alloc_size);

	alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
	unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
	if (!unit_map)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      alloc_size);

	alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
	unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
	if (!unit_off)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      alloc_size);

	/* UINT_MAX marks a cpu that has not been assigned a unit yet */
	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
		unit_map[cpu] = UINT_MAX;

	/* NR_CPUS means "not determined yet" for the low/high unit cpus */
	pcpu_low_unit_cpu = NR_CPUS;
	pcpu_high_unit_cpu = NR_CPUS;

	/*
	 * unit is the global index of the current group's first unit; when
	 * the inner loop finishes i equals gi->nr_units, so "unit += i"
	 * advances to the next group's first unit.
	 */
	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
		const struct pcpu_group_info *gi = &ai->groups[group];

		group_offsets[group] = gi->base_offset;
		group_sizes[group] = gi->nr_units * ai->unit_size;

		for (i = 0; i < gi->nr_units; i++) {
			cpu = gi->cpu_map[i];
			/* unit without a cpu, nothing to record */
			if (cpu == NR_CPUS)
				continue;

			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
			/* each cpu may appear in at most one unit */
			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);

			unit_map[cpu] = unit + i;
			unit_off[cpu] = gi->base_offset + i * ai->unit_size;

			/* determine low/high unit_cpu */
			if (pcpu_low_unit_cpu == NR_CPUS ||
			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
				pcpu_low_unit_cpu = cpu;
			if (pcpu_high_unit_cpu == NR_CPUS ||
			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
				pcpu_high_unit_cpu = cpu;
		}
	}
	pcpu_nr_units = unit;

	/* every possible cpu must have been given a unit */
	for_each_possible_cpu(cpu)
		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);

	/* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
	pcpu_dump_alloc_info(KERN_DEBUG, ai);

	/* publish the tables built above */
	pcpu_nr_groups = ai->nr_groups;
	pcpu_group_offsets = group_offsets;
	pcpu_group_sizes = group_sizes;
	pcpu_unit_map = unit_map;
	pcpu_unit_offsets = unit_off;

	/* determine basic parameters */
	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
	pcpu_atom_size = ai->atom_size;
	/* struct pcpu_chunk ends in a populated[] bitmap, one bit per page */
	pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated,
					     BITS_TO_LONGS(pcpu_unit_pages));

	pcpu_stats_save_ai(ai);

	/*
	 * Allocate chunk slots.  The slots after the active slots are:
	 *   sidelined_slot - isolated, depopulated chunks
	 *   free_slot - fully free chunks
	 *   to_depopulate_slot - isolated, chunks to depopulate
	 */
	pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
	pcpu_free_slot = pcpu_sidelined_slot + 1;
	pcpu_to_depopulate_slot = pcpu_free_slot + 1;
	pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
	pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
					  sizeof(pcpu_chunk_lists[0]),
					  SMP_CACHE_BYTES);
	if (!pcpu_chunk_lists)
		panic("%s: Failed to allocate %zu bytes\n", __func__,
		      pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));

	for (i = 0; i < pcpu_nr_slots; i++)
		INIT_LIST_HEAD(&pcpu_chunk_lists[i]);

	/*
	 * The end of the static region needs to be aligned with the
	 * minimum allocation size as this offsets the reserved and
	 * dynamic region.  The first chunk ends page aligned by
	 * expanding the dynamic region, therefore the dynamic region
	 * can be shrunk to compensate while still staying above the
	 * configured sizes.
	 */
	static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
	dyn_size = ai->dyn_size - (static_size - ai->static_size);

	/*
	 * Initialize first chunk:
	 * This chunk is broken up into 3 parts:
	 *		< static | [reserved] | dynamic >
	 * - static - there is no backing chunk because these allocations can
	 *   never be freed.
	 * - reserved (pcpu_reserved_chunk) - exists primarily to serve
	 *   allocations from module load.
	 * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first
	 *   chunk.
	 */
	tmp_addr = (unsigned long)base_addr + static_size;
	if (ai->reserved_size)
		pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
						ai->reserved_size);
	tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size;
	pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);

	/* only the dynamic chunk goes onto the slot lists */
	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
	pcpu_chunk_relocate(pcpu_first_chunk, -1);

	/* include all regions of the first chunk */
	pcpu_nr_populated += PFN_DOWN(size_sum);

	pcpu_stats_chunk_alloc();
	trace_percpu_create_chunk(base_addr);

	/* we're done */
	pcpu_base_addr = base_addr;
}

#ifdef CONFIG_SMP

/* names of the first chunk allocators, indexed by enum pcpu_fc */
const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
	[PCPU_FC_AUTO]	= "auto",
	[PCPU_FC_EMBED]	= "embed",
	[PCPU_FC_PAGE]	= "page",
};

/* first chunk allocator selection; may be changed by "percpu_alloc=" */
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;

/*
 * Handler for the "percpu_alloc" early parameter.  Accepts "embed" and
 * "page" when the corresponding allocator is configured in and stores the
 * choice in pcpu_chosen_fc.  Any other value only triggers a warning.
 *
 * Returns -EINVAL when no value was given, 0 otherwise.
 */
static int __init percpu_alloc_setup(char *str)
{
	if (!str)
		return -EINVAL;

	/*
	 * Dummy head of the chain so that every conditionally compiled
	 * branch below can uniformly start with "else if".
	 */
	if (0)
		/* nada */;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
	else if (!strcmp(str, "embed"))
		pcpu_chosen_fc = PCPU_FC_EMBED;
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
	else if (!strcmp(str, "page"))
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
	else
		pr_warn("unknown allocator %s specified\n", str);

	return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);

/*
 * pcpu_embed_first_chunk() is used by the generic percpu setup.
 * Build it if needed by the arch config or the generic setup is going
 * to be used.
*/
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
#define BUILD_EMBED_FIRST_CHUNK
#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || !CONFIG_HAVE_SETUP_PER_CPU_AREA */

/* build pcpu_page_first_chunk() iff needed by the arch config */
#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#define BUILD_PAGE_FIRST_CHUNK
#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */

/*
 * pcpu_build_alloc_info() is used by both embed and page first chunk,
 * so it is built whenever either of the two BUILD_* symbols is defined.
 */
#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.  The returned configuration is guaranteed
 * to have CPUs on different nodes on different groups and >=75% usage
 * of allocated virtual address space.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
*/
static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
				size_t reserved_size, size_t dyn_size,
				size_t atom_size,
				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
	/* group_map[cpu] = group of cpu, group_cnt[group] = cpus in group */
	static int group_map[NR_CPUS] __initdata;
	static int group_cnt[NR_CPUS] __initdata;
	/* cpus that have not been assigned to a group yet */
	static struct cpumask mask __initdata;
	const size_t static_size = __per_cpu_end - __per_cpu_start;
	int nr_groups = 1, nr_units = 0;
	size_t size_sum, min_unit_size, alloc_size;
	int upa, max_upa, best_upa;	/* units_per_alloc */
	int last_allocs, group, unit;
	unsigned int cpu, tcpu;
	struct pcpu_alloc_info *ai;
	unsigned int *cpu_map;

	/* this function may be called multiple times */
	memset(group_map, 0, sizeof(group_map));
	memset(group_cnt, 0, sizeof(group_cnt));
	cpumask_clear(&mask);

	/* calculate size_sum and ensure dyn_size is enough for early alloc */
	size_sum = PFN_ALIGN(static_size + reserved_size +
			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
	/* the page-alignment slack is handed to the dynamic area */
	dyn_size = size_sum - static_size - reserved_size;

	/*
	 * Determine min_unit_size, alloc_size and max_upa such that
	 * alloc_size is multiple of atom_size and is the smallest
	 * which can accommodate 4k aligned segments which are equal to
	 * or larger than min_unit_size.
	 */
	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

	/* determine the maximum # of units that can fit in an allocation */
	alloc_size = roundup(min_unit_size, atom_size);
	upa = alloc_size / min_unit_size;
	/* step down until alloc_size splits into upa page-aligned units */
	while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
		upa--;
	max_upa = upa;

	cpumask_copy(&mask, cpu_possible_mask);

	/* group cpus according to their proximity */
	for (group = 0; !cpumask_empty(&mask); group++) {
		/* pop the group's first cpu */
		cpu = cpumask_first(&mask);
		group_map[cpu] = group;
		group_cnt[group]++;
		cpumask_clear_cpu(cpu, &mask);

		/*
		 * Pull in every remaining cpu that is local to @cpu in both
		 * directions; without a distance callback all cpus end up
		 * in this first group.
		 */
		for_each_cpu(tcpu, &mask) {
			if (!cpu_distance_fn ||
			    (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
			     cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
				group_map[tcpu] = group;
				group_cnt[group]++;
				cpumask_clear_cpu(tcpu, &mask);
			}
		}
	}
	nr_groups = group;

	/*
	 * Wasted space is caused by a ratio imbalance of upa to group_cnt.
	 * Expand the unit_size until we use >= 75% of the units allocated.
	 * Related to atom_size, which could be much larger than the unit_size.
	 */
	last_allocs = INT_MAX;
	best_upa = 0;
	for (upa = max_upa; upa; upa--) {
		int allocs = 0, wasted = 0;

		/* same divisibility/page-alignment rule as for max_upa */
		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
			continue;

		/* count allocations and unused units for this upa */
		for (group = 0; group < nr_groups; group++) {
			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
			allocs += this_allocs;
			wasted += this_allocs * upa - group_cnt[group];
		}

		/*
		 * Don't accept if wastage is over 1/3.  The
		 * greater-than comparison ensures upa==1 always
		 * passes the following check.
		 */
		if (wasted > num_possible_cpus() / 3)
			continue;

		/* and then don't consume more memory */
		if (allocs > last_allocs)
			break;
		last_allocs = allocs;
		best_upa = upa;
	}
	BUG_ON(!best_upa);
	upa = best_upa;

	/* allocate and fill alloc_info */
	for (group = 0; group < nr_groups; group++)
		nr_units += roundup(group_cnt[group], upa);

	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
	if (!ai)
		return ERR_PTR(-ENOMEM);
	cpu_map = ai->groups[0].cpu_map;

	/* carve the single cpu_map array into one slice per group */
	for (group = 0; group < nr_groups; group++) {
		ai->groups[group].cpu_map = cpu_map;
		cpu_map += roundup(group_cnt[group], upa);
	}

	ai->static_size = static_size;
	ai->reserved_size = reserved_size;
	ai->dyn_size = dyn_size;
	ai->unit_size = alloc_size / upa;
	ai->atom_size = atom_size;
	ai->alloc_size = alloc_size;

	for (group = 0, unit = 0; group < nr_groups; group++) {
		struct pcpu_group_info *gi = &ai->groups[group];

		/*
		 * Initialize base_offset as if all groups are located
		 * back-to-back.  The caller should update this to
		 * reflect actual allocation.
		 */
		gi->base_offset = unit * ai->unit_size;

		/* list the group's cpus, then pad nr_units to a upa multiple */
		for_each_possible_cpu(cpu)
			if (group_map[cpu] == group)
				gi->cpu_map[gi->nr_units++] = cpu;
		gi->nr_units = roundup(gi->nr_units, upa);
		unit += gi->nr_units;
	}
	BUG_ON(unit != nr_units);

	return ai;
}

/*
 * pcpu_fc_alloc - allocate first chunk memory from memblock for @cpu
 * @cpu: cpu the memory is meant for
 * @size: number of bytes to allocate
 * @align: required alignment
 * @cpu_to_nd_fn: callback to convert @cpu to its node, optional
 *
 * With CONFIG_NUMA the allocation is attempted on the node reported by
 * @cpu_to_nd_fn.  If there is no callback, or the node is NUMA_NO_NODE,
 * offline or has no NODE_DATA, a node-agnostic allocation is used instead
 * and that fact is logged.  Without CONFIG_NUMA the node-agnostic
 * allocation is always used.
 *
 * Returns whatever the memblock allocator returned; callers in this file
 * check the result for NULL.
 */
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align,
				   pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
	/* address hint handed to the memblock allocators below */
	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NUMA
	int node = NUMA_NO_NODE;
	void *ptr;

	if (cpu_to_nd_fn)
		node = cpu_to_nd_fn(cpu);

	if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) {
		ptr = memblock_alloc_from(size, align, goal);
		pr_info("cpu %d has no node %d or node-local memory\n",
			cpu, node);
		pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n",
			 cpu, size, (u64)__pa(ptr));
	} else {
		ptr = memblock_alloc_try_nid(size, align, goal,
					     MEMBLOCK_ALLOC_ACCESSIBLE,
					     node);

		pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n",
			 cpu, size, node, (u64)__pa(ptr));
	}
	return ptr;
#else
	return memblock_alloc_from(size, align, goal);
#endif
}

/*
 * pcpu_fc_free - give @size bytes at @ptr back to memblock
 *
 * Counterpart of pcpu_fc_alloc(); also used to return unused parts of an
 * allocation.
 */
static void __init pcpu_fc_free(void *ptr, size_t size)
{
	memblock_free(ptr, size);
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @cpu_to_nd_fn: callback to convert cpu to it's node, optional
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * by calling pcpu_fc_alloc and used as-is without being mapped into
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size.
 *
 * This enables the first chunk to piggy back on the linear physical
 * mapping which often uses larger page size.  Please note that this
 * can result in very sparse cpu->unit mapping on NUMA machines thus
 * requiring large vmalloc address space.  Don't use this allocator if
 * vmalloc space is not orders of magnitude larger than distances
 * between node memory addresses (ie. 32bit NUMA machines).
 *
 * @dyn_size specifies the minimum dynamic area size.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned using pcpu_fc_free.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
*/ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { void *base = (void *)ULONG_MAX; void **areas = NULL; struct pcpu_alloc_info *ai; size_t size_sum, areas_size; unsigned long max_distance; int group, i, highest_group, rc = 0; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, cpu_distance_fn); if (IS_ERR(ai)) return PTR_ERR(ai); size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); areas = memblock_alloc(areas_size, SMP_CACHE_BYTES); if (!areas) { rc = -ENOMEM; goto out_free; } /* allocate, copy and determine base address & max_distance */ highest_group = 0; for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; unsigned int cpu = NR_CPUS; void *ptr; /* find the first unit of this group that maps a real cpu; that cpu is what gets passed to pcpu_fc_alloc() */ for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) cpu = gi->cpu_map[i]; BUG_ON(cpu == NR_CPUS); /* allocate space for the whole group */ ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn); if (!ptr) { rc = -ENOMEM; goto out_free_areas; } /* kmemleak tracks the percpu allocations separately */ kmemleak_ignore_phys(__pa(ptr)); areas[group] = ptr; base = min(ptr, base); if (ptr > areas[highest_group]) highest_group = group; } max_distance = areas[highest_group] - base; max_distance += ai->unit_size * ai->groups[highest_group].nr_units; /* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > VMALLOC_TOTAL * 3 / 4) { pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n", max_distance, VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ rc = -EINVAL; goto out_free_areas; #endif } /* * Copy data and free unused parts. This should happen after all * allocations are complete; otherwise, we may end up with * overlapping groups. 
*/ for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; void *ptr = areas[group]; for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { if (gi->cpu_map[i] == NR_CPUS) { /* unused unit, free whole */ pcpu_fc_free(ptr, ai->unit_size); continue; } /* copy and return the unused part */ memcpy(ptr, __per_cpu_load, ai->static_size); pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } /* base address is now known, determine group base offsets */ for (group = 0; group < ai->nr_groups; group++) { ai->groups[group].base_offset = areas[group] - base; } pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); pcpu_setup_first_chunk(ai, base); goto out_free; /* error unwind: give back every group area allocated so far */ out_free_areas: for (group = 0; group < ai->nr_groups; group++) if (areas[group]) pcpu_fc_free(areas[group], ai->groups[group].nr_units * ai->unit_size); /* shared exit for success and failure: the alloc_info and the areas array are always released */ out_free: pcpu_free_alloc_info(ai); if (areas) memblock_free(areas, areas_size); return rc; } #endif /* BUILD_EMBED_FIRST_CHUNK */ #ifdef BUILD_PAGE_FIRST_CHUNK #include <asm/pgalloc.h> #ifndef P4D_TABLE_SIZE #define P4D_TABLE_SIZE PAGE_SIZE #endif #ifndef PUD_TABLE_SIZE #define PUD_TABLE_SIZE PAGE_SIZE #endif #ifndef PMD_TABLE_SIZE #define PMD_TABLE_SIZE PAGE_SIZE #endif #ifndef PTE_TABLE_SIZE #define PTE_TABLE_SIZE PAGE_SIZE #endif /* * pcpu_populate_pte - make sure the page-table path for @addr exists * * Walks pgd -> p4d -> pud -> pmd for the kernel address @addr and, at each * level that is empty (or, for the pmd, not present), allocates a table from * memblock and installs it. Panics if any allocation fails. Declared __weak. */ void __init __weak pcpu_populate_pte(unsigned long addr) { pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; pmd_t *pmd; if (pgd_none(*pgd)) { p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); if (!p4d) goto err_alloc; pgd_populate(&init_mm, pgd, p4d); } p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); if (!pud) goto err_alloc; p4d_populate(&init_mm, p4d, pud); } pud = pud_offset(p4d, addr); if (pud_none(*pud)) { pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); if (!pmd) goto err_alloc; pud_populate(&init_mm, pud, pmd); } pmd = 
pmd_offset(pud, addr); if (!pmd_present(*pmd)) { pte_t *new; new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); if (!new) goto err_alloc; pmd_populate_kernel(&init_mm, pmd, new); } return; err_alloc: panic("%s: Failed to allocate memory\n", __func__); } /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes * @cpu_to_nd_fn: callback to convert cpu to its node, optional * * This is a helper to ease setting up page-remapped first percpu * chunk and can be called where pcpu_setup_first_chunk() is expected. * * This is the basic allocator. Static percpu area is allocated * page-by-page into vmalloc area. * * RETURNS: * 0 on success, -errno on failure. */ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; int unit, i, j, rc = 0; int upa; int nr_g0_units; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL); if (IS_ERR(ai)) return PTR_ERR(ai); BUG_ON(ai->nr_groups != 1); upa = ai->alloc_size/ai->unit_size; nr_g0_units = roundup(num_possible_cpus(), upa); if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) { pcpu_free_alloc_info(ai); return -EINVAL; } unit_pages = ai->unit_size >> PAGE_SHIFT; /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); if (!pages) panic("%s: Failed to allocate %zu bytes\n", __func__, pages_size); /* allocate pages */ j = 0; for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned int cpu = ai->groups[0].cpu_map[unit]; for (i = 0; i < unit_pages; i++) { void *ptr; ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn); if (!ptr) { pr_warn("failed to allocate %s page for 
cpu%u\n", psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ kmemleak_ignore_phys(__pa(ptr)); pages[j++] = virt_to_page(ptr); } } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; vm.size = num_possible_cpus() * ai->unit_size; vm_area_register_early(&vm, PAGE_SIZE); for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned long unit_addr = (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], unit_pages); if (rc < 0) panic("failed to map percpu area, err=%d\n", rc); flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size); /* copy static data */ memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ pr_info("%d %s pages/cpu s%zu r%zu d%zu\n", unit_pages, psize_str, ai->static_size, ai->reserved_size, ai->dyn_size); pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; /* allocation failed part-way: free the j pages obtained so far, newest first */ enomem: while (--j >= 0) pcpu_fc_free(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; /* shared exit: the pages[] array and the alloc_info are released on success and failure alike */ out_free_ar: memblock_free(pages, pages_size); pcpu_free_alloc_info(ai); return rc; } #endif /* BUILD_PAGE_FIRST_CHUNK */ #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA /* * Generic SMP percpu area setup. * * The embedding helper is used because its behavior closely resembles * the original non-dynamic generic percpu area setup. This is * important because many archs have addressing restrictions and might * fail if the percpu area is located far away from the previous * location. As an added bonus, in non-NUMA cases, embedding is * generally a good idea TLB-wise because percpu area can piggy back * on the physical linear memory mapping which uses large page * mappings on applicable archs. 
*/ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { unsigned long delta; unsigned int cpu; int rc; /* * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, NULL); if (rc < 0) panic("Failed to initialize percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ #else /* CONFIG_SMP */ /* * UP percpu area setup. * * UP always uses km-based percpu allocator with identity mapping. * Static percpu variables are indistinguishable from the usual static * variables and don't require any special preparation. */ void __init setup_per_cpu_areas(void) { const size_t unit_size = roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE, PERCPU_DYNAMIC_RESERVE)); struct pcpu_alloc_info *ai; void *fc; ai = pcpu_alloc_alloc_info(1, 1); fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); /* kmemleak tracks the percpu allocations separately */ kmemleak_ignore_phys(__pa(fc)); ai->dyn_size = unit_size; ai->unit_size = unit_size; ai->atom_size = unit_size; ai->alloc_size = unit_size; ai->groups[0].nr_units = 1; ai->groups[0].cpu_map[0] = 0; pcpu_setup_first_chunk(ai, fc); pcpu_free_alloc_info(ai); } #endif /* CONFIG_SMP */ /* * pcpu_nr_pages - calculate total number of populated backing pages * * This reflects the number of pages populated to back chunks. Metadata is * excluded in the number exposed in meminfo as the number of backing pages * scales with the number of cpus and can quickly outweigh the memory used for * metadata. It also keeps this calculation nice and simple. 
*
 * RETURNS:
 * Total number of populated backing pages in use by the allocator.
 */
unsigned long pcpu_nr_pages(void)
{
	unsigned long nr_backing = pcpu_nr_populated * pcpu_nr_units;

	return nr_backing;
}

/*
 * The percpu allocator is brought up early in boot, before slab and
 * workqueue are usable, so async management stays plugged until this
 * initcall flips the switch.
 */
static int __init percpu_enable_async(void)
{
	pcpu_async_enabled = true;

	return 0;
}
subsys_initcall(percpu_enable_async); |
| 1673 2 2 2 2 2 119 118 2 2 2 2 2 120 120 1668 1659 1619 884 883 885 884 883 883 792 792 87 6 16 83 78 16 80 91 4 796 795 797 794 4 4 4 4 964 1684 1642 1646 797 794 50 795 883 884 884 882 884 882 879 1 1 882 4 4 2 1 1 2 881 884 880 3 8 876 1172 1175 1178 1177 1028 1177 982 981 245 882 879 61 8 60 60 60 43 44 20 32 32 10 1 9 9 6 6 1 5 5 4 4 1 1 2 2 2 2 1599 1594 802 1599 1600 1600 33 34 881 879 883 881 880 9 885 795 795 795 795 794 796 795 797 795 795 794 795 4 797 789 793 23 23 23 14 10 928 193 793 1 1 1 1 1 1 1 1 7 15 14 12 12 6 15 15 2 15 11 23 23 23 15 6 11 15 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 
384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 
884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 
1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 
1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/dir.c - kernfs directory implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/sched.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/hash.h> #include "kernfs-internal.h" static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ /* * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to * call pr_cont() while holding rename_lock. Because sometimes pr_cont() * will perform wakeups when releasing console_sem. Holding rename_lock * will introduce deadlock if the scheduler reads the kernfs_name in the * wakeup path. 
*/ static DEFINE_SPINLOCK(kernfs_pr_cont_lock); static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) static bool __kernfs_active(struct kernfs_node *kn) { return atomic_read(&kn->active) >= 0; } static bool kernfs_active(struct kernfs_node *kn) { lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); return __kernfs_active(kn); } static bool kernfs_lockdep(struct kernfs_node *kn) { #ifdef CONFIG_DEBUG_LOCK_ALLOC return kn->flags & KERNFS_LOCKDEP; #else return false; #endif } static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) { if (!kn) return strscpy(buf, "(null)", buflen); return strscpy(buf, kn->parent ? kn->name : "/", buflen); } /* kernfs_node_depth - compute depth from @from to @to */ static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to) { size_t depth = 0; while (to->parent && to != from) { depth++; to = to->parent; } return depth; } static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, struct kernfs_node *b) { size_t da, db; struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b); if (ra != rb) return NULL; da = kernfs_depth(ra->kn, a); db = kernfs_depth(rb->kn, b); while (da > db) { a = a->parent; da--; } while (db > da) { b = b->parent; db--; } /* worst case b and a will be the same at root */ while (b != a) { b = b->parent; a = a->parent; } return a; } /** * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to, * where kn_from is treated as root of the path. 
* @kn_from: kernfs node which should be treated as root for the path * @kn_to: kernfs node to which path is needed * @buf: buffer to copy the path into * @buflen: size of @buf * * We need to handle couple of scenarios here: * [1] when @kn_from is an ancestor of @kn_to at some level * kn_from: /n1/n2/n3 * kn_to: /n1/n2/n3/n4/n5 * result: /n4/n5 * * [2] when @kn_from is on a different hierarchy and we need to find common * ancestor between @kn_from and @kn_to. * kn_from: /n1/n2/n3/n4 * kn_to: /n1/n2/n5 * result: /../../n5 * OR * kn_from: /n1/n2/n3/n4/n5 [depth=5] * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * * [3] when @kn_to is %NULL result will be "(null)" * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn_from, char *buf, size_t buflen) { struct kernfs_node *kn, *common; const char parent_str[] = "/.."; size_t depth_from, depth_to, len = 0; ssize_t copied; int i, j; if (!kn_to) return strscpy(buf, "(null)", buflen); if (!kn_from) kn_from = kernfs_root(kn_to)->kn; if (kn_from == kn_to) return strscpy(buf, "/", buflen); common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) return -EINVAL; depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); buf[0] = '\0'; for (i = 0; i < depth_from; i++) { copied = strscpy(buf + len, parent_str, buflen - len); if (copied < 0) return copied; len += copied; } /* Calculate how many bytes we need for the rest */ for (i = depth_to - 1; i >= 0; i--) { for (kn = kn_to, j = 0; j < i; j++) kn = kn->parent; len += scnprintf(buf + len, buflen - len, "/%s", kn->name); } return len; } /** * kernfs_name - obtain the name of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into * @buflen: size of @buf * * Copies the name of @kn into @buf of 
@buflen bytes. The behavior is * similar to strscpy(). * * Fills buffer with "(null)" if @kn is %NULL. * * Return: the resulting length of @buf. If @buf isn't long enough, * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG. * * This function can be called from any context. */ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { unsigned long flags; int ret; read_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_name_locked(kn, buf, buflen); read_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } /** * kernfs_path_from_node - build path of node @to relative to @from. * @from: parent kernfs_node relative to which we need to build the path * @to: kernfs_node of interest * @buf: buffer to copy @to's path into * @buflen: size of @buf * * Builds @to's path relative to @from in @buf. @from and @to must * be on the same kernfs-root. If @from is not parent of @to, then a relative * path (which includes '..'s) as needed to reach from @from to @to is * returned. * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) { unsigned long flags; int ret; read_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_path_from_node_locked(to, from, buf, buflen); read_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } EXPORT_SYMBOL_GPL(kernfs_path_from_node); /** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. 
*/ void pr_cont_kernfs_name(struct kernfs_node *kn) { unsigned long flags; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); pr_cont("%s", kernfs_pr_cont_buf); spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * pr_cont_kernfs_path - pr_cont path of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_path(struct kernfs_node *kn) { unsigned long flags; int sz; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); if (sz < 0) { if (sz == -E2BIG) pr_cont("(name too long)"); else pr_cont("(error)"); goto out; } pr_cont("%s", kernfs_pr_cont_buf); out: spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * kernfs_get_parent - determine the parent node and pin it * @kn: kernfs_node of interest * * Determines @kn's parent, pins and returns it. This function can be * called from any context. 
* * Return: parent node of @kn */ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { struct kernfs_node *parent; unsigned long flags; read_lock_irqsave(&kernfs_rename_lock, flags); parent = kn->parent; kernfs_get(parent); read_unlock_irqrestore(&kernfs_rename_lock, flags); return parent; } /** * kernfs_name_hash - calculate hash of @ns + @name * @name: Null terminated string to hash * @ns: Namespace tag to hash * * Return: 31-bit hash of ns + name (so it fits in an off_t) */ static unsigned int kernfs_name_hash(const char *name, const void *ns) { unsigned long hash = init_name_hash(ns); unsigned int len = strlen(name); while (len--) hash = partial_name_hash(*name++, hash); hash = end_name_hash(hash); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ if (hash < 2) hash += 2; if (hash >= INT_MAX) hash = INT_MAX - 1; return hash; } static int kernfs_name_compare(unsigned int hash, const char *name, const void *ns, const struct kernfs_node *kn) { if (hash < kn->hash) return -1; if (hash > kn->hash) return 1; if (ns < kn->ns) return -1; if (ns > kn->ns) return 1; return strcmp(name, kn->name); } static int kernfs_sd_compare(const struct kernfs_node *left, const struct kernfs_node *right) { return kernfs_name_compare(left->hash, left->name, left->ns, right); } /** * kernfs_link_sibling - link kernfs_node into sibling rbtree * @kn: kernfs_node of interest * * Link @kn into its sibling rbtree which starts from * @kn->parent->dir.children. * * Locking: * kernfs_rwsem held exclusive * * Return: * %0 on success, -EEXIST on failure. 
*/ static int kernfs_link_sibling(struct kernfs_node *kn) { struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; while (*node) { struct kernfs_node *pos; int result; pos = rb_to_kn(*node); parent = *node; result = kernfs_sd_compare(kn, pos); if (result < 0) node = &pos->rb.rb_left; else if (result > 0) node = &pos->rb.rb_right; else return -EEXIST; } /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn->parent->dir.children); /* successfully added, account subdir number */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs++; kernfs_inc_rev(kn->parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); return 0; } /** * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree * @kn: kernfs_node of interest * * Try to unlink @kn from its sibling rbtree which starts from * kn->parent->dir.children. * * Return: %true if @kn was actually removed, * %false if @kn wasn't on the rbtree. * * Locking: * kernfs_rwsem held exclusive */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { if (RB_EMPTY_NODE(&kn->rb)) return false; down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; kernfs_inc_rev(kn->parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); rb_erase(&kn->rb, &kn->parent->dir.children); RB_CLEAR_NODE(&kn->rb); return true; } /** * kernfs_get_active - get an active reference to kernfs_node * @kn: kernfs_node to get an active reference to * * Get an active reference of @kn. This function is noop if @kn * is %NULL. * * Return: * Pointer to @kn on success, %NULL on failure. 
*/ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) { if (unlikely(!kn)) return NULL; /* fails when ->active is negative, i.e. the node is not active */ if (!atomic_inc_unless_negative(&kn->active)) return NULL; if (kernfs_lockdep(kn)) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } /** * kernfs_put_active - put an active reference to kernfs_node * @kn: kernfs_node to put an active reference to * * Put an active reference to @kn. This function is noop if @kn * is %NULL. */ void kernfs_put_active(struct kernfs_node *kn) { int v; if (unlikely(!kn)) return; if (kernfs_lockdep(kn)) rwsem_release(&kn->dep_map, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; /* counter is back at the deactivated bias: wake everyone sleeping on deactivate_waitq */ wake_up_all(&kernfs_root(kn)->deactivate_waitq); } /** * kernfs_drain - drain kernfs_node * @kn: kernfs_node to drain * * Drain existing usages and nuke all existing mmaps of @kn. Multiple * removers may invoke this function concurrently on @kn and all will * return after draining is complete. */ static void kernfs_drain(struct kernfs_node *kn) __releases(&kernfs_root(kn)->kernfs_rwsem) __acquires(&kernfs_root(kn)->kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); /* * Skip draining if already fully drained. This avoids draining and its * lockdep annotations for nodes which have never been activated * allowing embedding kernfs_remove() in create error paths without * worrying about draining. 
*/ if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS && !kernfs_should_drain_open_files(kn)) return; /* kernfs_rwsem is dropped for the wait below and re-taken before returning */ up_write(&root->kernfs_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) lock_contended(&kn->dep_map, _RET_IP_); } wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); if (kernfs_lockdep(kn)) { lock_acquired(&kn->dep_map, _RET_IP_); rwsem_release(&kn->dep_map, _RET_IP_); } if (kernfs_should_drain_open_files(kn)) kernfs_drain_open_files(kn); down_write(&root->kernfs_rwsem); } /** * kernfs_get - get a reference count on a kernfs_node * @kn: the target kernfs_node */ void kernfs_get(struct kernfs_node *kn) { if (kn) { WARN_ON(!atomic_read(&kn->count)); atomic_inc(&kn->count); } } EXPORT_SYMBOL_GPL(kernfs_get); /* RCU callback queued by kernfs_put(): frees the node's name, its iattrs (if any) and finally the node itself */ static void kernfs_free_rcu(struct rcu_head *rcu) { struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); kfree_const(kn->name); if (kn->iattr) { simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } kmem_cache_free(kernfs_node_cache, kn); } /** * kernfs_put - put a reference count on a kernfs_node * @kn: the target kernfs_node * * Put a reference count of @kn and destroy it if it reached zero. */ void kernfs_put(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); repeat: /* * Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ parent = kn->parent; WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, "kernfs_put: %s/%s: released with incorrect active_ref %d\n", parent ? 
parent->name : "", kn->name, atomic_read(&kn->active)); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); call_rcu(&kn->rcu, kernfs_free_rcu); /* continue with the parent: drop one reference on it and repeat if that was its last */ kn = parent; if (kn) { if (atomic_dec_and_test(&kn->count)) goto repeat; } else { /* just released the root kn, free @root too */ idr_destroy(&root->ino_idr); kfree_rcu(root, rcu); } } EXPORT_SYMBOL_GPL(kernfs_put); /** * kernfs_node_from_dentry - determine kernfs_node associated with a dentry * @dentry: the dentry in question * * Return: the kernfs_node associated with @dentry. If @dentry is not a * kernfs one, %NULL is returned. * * While the returned kernfs_node will stay accessible as long as @dentry * is accessible, the returned node can be in any state and the caller is * fully responsible for determining what's accessible. */ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { /* a dentry is treated as a kernfs one when its superblock uses kernfs_sops */ if (dentry->d_sb->s_op == &kernfs_sops) return kernfs_dentry_node(dentry); return NULL; } static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; u32 id_highbits; int ret; name = kstrdup_const(name, GFP_KERNEL); if (!name) return NULL; kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); if (!kn) goto err_out1; idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++; id_highbits = root->id_highbits; root->last_id_lowbits = ret; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; kn->id = (u64)id_highbits << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); kn->name = name; kn->mode = mode; kn->flags = flags; if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, 
GLOBAL_ROOT_GID)) { struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = uid, .ia_gid = gid, }; ret = __kernfs_setattr(kn, &iattr); if (ret < 0) goto err_out3; } if (parent) { ret = security_kernfs_init_security(parent, kn); if (ret) goto err_out3; } return kn; err_out3: spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: kfree_const(name); return NULL; } struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; if (parent->mode & S_ISGID) { /* this code block imitates inode_init_owner() for * kernfs */ if (parent->iattr) gid = parent->iattr->ia_gid; if (flags & KERNFS_DIR) mode |= S_ISGID; } kn = __kernfs_new_node(kernfs_root(parent), parent, name, mode, uid, gid, flags); if (kn) { kernfs_get(parent); kn->parent = parent; } return kn; } /* * kernfs_find_and_get_node_by_id - get kernfs_node from node id * @root: the kernfs root * @id: the target node id * * @id's lower 32bits encode ino and upper gen. If the gen portion is * zero, all generations are matched. * * Return: %NULL on failure, * otherwise a kernfs node with reference counter incremented. */ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id) { struct kernfs_node *kn; ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); rcu_read_lock(); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) goto err_unlock; if (sizeof(ino_t) >= sizeof(u64)) { /* we looked up with the low 32bits, compare the whole */ if (kernfs_ino(kn) != ino) goto err_unlock; } else { /* 0 matches all generations */ if (unlikely(gen && kernfs_gen(kn) != gen)) goto err_unlock; } /* * We should fail if @kn has never been activated and guarantee success * if the caller knows that @kn is active. 
Both can be achieved by * __kernfs_active() which tests @kn->active without kernfs_rwsem. */ if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; rcu_read_unlock(); return kn; err_unlock: rcu_read_unlock(); return NULL; } /** * kernfs_add_one - add kernfs_node to parent without warning * @kn: kernfs_node to be added * * The caller must already have initialized @kn->parent. This * function increments nlink of the parent's inode if @kn is a * directory and link into the children list of the parent. * * Return: * %0 on success, -EEXIST if entry with the given name already * exists. */ int kernfs_add_one(struct kernfs_node *kn) { struct kernfs_node *parent = kn->parent; struct kernfs_root *root = kernfs_root(parent); struct kernfs_iattrs *ps_iattr; bool has_ns; int ret; down_write(&root->kernfs_rwsem); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", parent->name, kn->name)) goto out_unlock; if (kernfs_type(parent) != KERNFS_DIR) goto out_unlock; ret = -ENOENT; if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) goto out_unlock; kn->hash = kernfs_name_hash(kn->name, kn->ns); ret = kernfs_link_sibling(kn); if (ret) goto out_unlock; /* Update timestamps on the parent */ down_write(&root->kernfs_iattr_rwsem); ps_iattr = parent->iattr; if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&root->kernfs_iattr_rwsem); up_write(&root->kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. * If not activated here, the kernfs user is responsible for * activating the node with kernfs_activate(). A node which hasn't * been activated is not visible to userland and its removal won't * trigger deactivation. 
*/ if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return 0; out_unlock: up_write(&root->kernfs_rwsem); return ret; } /** * kernfs_find_ns - find kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, const unsigned char *name, const void *ns) { struct rb_node *node = parent->dir.children.rb_node; bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", parent->name, name); return NULL; } hash = kernfs_name_hash(name, ns); while (node) { struct kernfs_node *kn; int result; kn = rb_to_kn(node); result = kernfs_name_compare(hash, name, ns, kn); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return kn; } return NULL; } static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, const unsigned char *path, const void *ns) { ssize_t len; char *p, *name; lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); spin_lock_irq(&kernfs_pr_cont_lock); len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); if (len < 0) { spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } p = kernfs_pr_cont_buf; while ((name = strsep(&p, "/")) && parent) { if (*name == '\0') continue; parent = kernfs_find_ns(parent, name, ns); } spin_unlock_irq(&kernfs_pr_cont_lock); return parent; } /** * kernfs_find_and_get_ns - find and get kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent and get a reference * if found. 
This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** * kernfs_walk_and_get_ns - find and get kernfs_node with the given path * @parent: kernfs_node to search under * @path: path to look for * @ns: the namespace tag to use * * Look for kernfs_node with path @path under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } /** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy * @flags: KERNFS_ROOT_* flags * @priv: opaque data associated with the new directory * * Return: the root of the new hierarchy on success, ERR_PTR() value on * failure. */ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { struct kernfs_root *root; struct kernfs_node *kn; root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); idr_init(&root->ino_idr); init_rwsem(&root->kernfs_rwsem); init_rwsem(&root->kernfs_iattr_rwsem); init_rwsem(&root->kernfs_supers_rwsem); INIT_LIST_HEAD(&root->supers); /* * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. * High bits generation. The starting value for both ino and * genenration is 1. 
Initialize upper 32bit allocation * accordingly. */ if (sizeof(ino_t) >= sizeof(u64)) root->id_highbits = 0; else root->id_highbits = 1; kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) { idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } kn->priv = priv; kn->dir.root = root; root->syscall_ops = scops; root->flags = flags; root->kn = kn; init_waitqueue_head(&root->deactivate_waitq); if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return root; } /** * kernfs_destroy_root - destroy a kernfs hierarchy * @root: root of the hierarchy to destroy * * Destroy the hierarchy anchored at @root by removing all existing * directories and destroying @root. */ void kernfs_destroy_root(struct kernfs_root *root) { /* * kernfs_remove holds kernfs_rwsem from the root so the root * shouldn't be freed during the operation. */ kernfs_get(root->kn); kernfs_remove(root->kn); kernfs_put(root->kn); /* will also free @root */ } /** * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root * @root: root to use to lookup * * Return: @root's kernfs_node */ struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) { return root->kn; } /** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory * @name: name of the new directory * @mode: mode of the new directory * @uid: uid of the new directory * @gid: gid of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * * Return: the created node on success, ERR_PTR() value on failure. 
*/ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, mode | S_IFDIR, uid, gid, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->dir.root = parent->dir.root; kn->ns = ns; kn->priv = priv; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } /** * kernfs_create_empty_dir - create an always empty directory * @parent: parent in which to create a new directory * @name: name of the new directory * * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->flags |= KERNFS_EMPTY_DIR; kn->dir.root = parent->dir.root; kn->ns = NULL; kn->priv = NULL; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn; struct kernfs_root *root; if (flags & LOOKUP_RCU) return -ECHILD; /* Negative hashed dentry? */ if (d_really_is_negative(dentry)) { struct kernfs_node *parent; /* If the kernfs parent node has changed discard and * proceed to ->lookup. * * There's nothing special needed here when getting the * dentry parent, even if a concurrent rename is in * progress. That's because the dentry is negative so * it can only be the target of the rename and it will * be doing a d_move() not a replace. Consequently the * dentry d_parent won't change over the d_move(). 
* * Also kernfs negative dentries transitioning from * negative to positive during revalidate won't happen * because they are invalidated on containing directory * changes and the lookup re-done so that a new positive * dentry can be properly created. */ root = kernfs_root_from_sb(dentry->d_sb); down_read(&root->kernfs_rwsem); parent = kernfs_dentry_node(dentry->d_parent); if (parent) { if (kernfs_dir_changed(parent, dentry)) { up_read(&root->kernfs_rwsem); return 0; } } up_read(&root->kernfs_rwsem); /* The kernfs parent node hasn't changed, leave the * dentry negative and return success. */ return 1; } kn = kernfs_dentry_node(dentry); root = kernfs_root(kn); down_read(&root->kernfs_rwsem); /* The kernfs node has been deactivated */ if (!kernfs_active(kn)) goto out_bad; /* The kernfs node has been moved? */ if (kernfs_dentry_node(dentry->d_parent) != kn->parent) goto out_bad; /* The kernfs node has been renamed */ if (strcmp(dentry->d_name.name, kn->name) != 0) goto out_bad; /* The kernfs node has been moved to a different namespace */ if (kn->parent && kernfs_ns_enabled(kn->parent) && kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; up_read(&root->kernfs_rwsem); return 1; out_bad: up_read(&root->kernfs_rwsem); return 0; } const struct dentry_operations kernfs_dops = { .d_revalidate = kernfs_dop_revalidate, }; static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; struct kernfs_root *root; struct inode *inode = NULL; const void *ns = NULL; root = kernfs_root(parent); down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* attach dentry and inode */ if (kn) { /* Inactive nodes are invisible to the VFS so don't * create a negative. 
*/ if (!kernfs_active(kn)) { up_read(&root->kernfs_rwsem); return NULL; } inode = kernfs_get_inode(dir->i_sb, kn); if (!inode) inode = ERR_PTR(-ENOMEM); } /* * Needed for negative dentry validation. * The negative dentry can be created in kernfs_iop_lookup() * or transforms from positive dentry in dentry_unlink_inode() * called from vfs_rmdir(). */ if (!IS_ERR(inode)) kernfs_set_rev(parent, dentry); up_read(&root->kernfs_rwsem); /* instantiate and hash (possibly negative) dentry */ return d_splice_alias(inode, dentry); } static int kernfs_iop_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct kernfs_node *parent = dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; int ret; if (!scops || !scops->mkdir) return -EPERM; if (!kernfs_get_active(parent)) return -ENODEV; ret = scops->mkdir(parent, dentry->d_name.name, mode); kernfs_put_active(parent); return ret; } static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (!scops || !scops->rmdir) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; ret = scops->rmdir(kn); kernfs_put_active(kn); return ret; } static int kernfs_iop_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct kernfs_node *kn = kernfs_dentry_node(old_dentry); struct kernfs_node *new_parent = new_dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (flags) return -EINVAL; if (!scops || !scops->rename) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; if (!kernfs_get_active(new_parent)) { kernfs_put_active(kn); return -ENODEV; } ret = scops->rename(kn, new_parent, new_dentry->d_name.name); kernfs_put_active(new_parent); kernfs_put_active(kn); return ret; } const struct 
inode_operations kernfs_dir_iops = { .lookup = kernfs_iop_lookup, .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .listxattr = kernfs_iop_listxattr, .mkdir = kernfs_iop_mkdir, .rmdir = kernfs_iop_rmdir, .rename = kernfs_iop_rename, }; static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) { struct kernfs_node *last; while (true) { struct rb_node *rbn; last = pos; if (kernfs_type(pos) != KERNFS_DIR) break; rbn = rb_first(&pos->dir.children); if (!rbn) break; pos = rb_to_kn(rbn); } return last; } /** * kernfs_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: kernfs_node whose descendants to walk * * Find the next descendant to visit for post-order traversal of @root's * descendants. @root is included in the iteration and the last node to be * visited. * * Return: the next descendant to visit or %NULL when done. */ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, struct kernfs_node *root) { struct rb_node *rbn; lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) return kernfs_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ rbn = rb_next(&pos->rb); if (rbn) return kernfs_leftmost_descendant(rb_to_kn(rbn)); /* no sibling left, visit parent */ return pos->parent; } static void kernfs_activate_one(struct kernfs_node *kn) { lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); kn->flags |= KERNFS_ACTIVATED; if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))) return; WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb)); WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); } /** * kernfs_activate - 
activate a node which started deactivated * @kn: kernfs_node whose subtree is to be activated * * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node * needs to be explicitly activated. A node which hasn't been activated * isn't visible to userland and deactivation is skipped during its * removal. This is useful to construct atomic init sequences where * creation of multiple nodes should either succeed or fail atomically. * * The caller is responsible for ensuring that this function is not called * after kernfs_remove*() is invoked on @kn. */ void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_rwsem); pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) kernfs_activate_one(pos); up_write(&root->kernfs_rwsem); } /** * kernfs_show - show or hide a node * @kn: kernfs_node to show or hide * @show: whether to show or hide * * If @show is %false, @kn is marked hidden and deactivated. A hidden node is * ignored in future activaitons. If %true, the mark is removed and activation * state is restored. This function won't implicitly activate a new node in a * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet. * * To avoid recursion complexities, directories aren't supported for now. */ void kernfs_show(struct kernfs_node *kn, bool show) { struct kernfs_root *root = kernfs_root(kn); if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR)) return; down_write(&root->kernfs_rwsem); if (show) { kn->flags &= ~KERNFS_HIDDEN; if (kn->flags & KERNFS_ACTIVATED) kernfs_activate_one(kn); } else { kn->flags |= KERNFS_HIDDEN; if (kernfs_active(kn)) atomic_add(KN_DEACTIVATED_BIAS, &kn->active); kernfs_drain(kn); } up_write(&root->kernfs_rwsem); } static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos; /* Short-circuit if non-root @kn has already finished removal. 
*/ if (!kn) return; lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); /* * This is for kernfs_remove_self() which plays with active ref * after removal. */ if (kn->parent && RB_EMPTY_NODE(&kn->rb)) return; pr_debug("kernfs %s: removing\n", kn->name); /* prevent new usage by marking all nodes removing and deactivating */ pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { pos->flags |= KERNFS_REMOVING; if (kernfs_active(pos)) atomic_add(KN_DEACTIVATED_BIAS, &pos->active); } /* deactivate and unlink the subtree node-by-node */ do { pos = kernfs_leftmost_descendant(kn); /* * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. */ kernfs_get(pos); kernfs_drain(pos); /* * kernfs_unlink_sibling() succeeds once per node. Use it * to decide who's responsible for cleanups. */ if (!pos->parent || kernfs_unlink_sibling(pos)) { struct kernfs_iattrs *ps_iattr = pos->parent ? pos->parent->iattr : NULL; /* update timestamps on the parent */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); kernfs_put(pos); } kernfs_put(pos); } while (pos != kn); } /** * kernfs_remove - remove a kernfs_node recursively * @kn: the kernfs_node to remove * * Remove @kn along with all its subdirectories and files. */ void kernfs_remove(struct kernfs_node *kn) { struct kernfs_root *root; if (!kn) return; root = kernfs_root(kn); down_write(&root->kernfs_rwsem); __kernfs_remove(kn); up_write(&root->kernfs_rwsem); } /** * kernfs_break_active_protection - break out of active protection * @kn: the self kernfs_node * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. 
Each invocation of * this function must also be matched with an invocation of * kernfs_unbreak_active_protection(). * * This function releases the active reference of @kn the caller is * holding. Once this function is called, @kn may be removed at any point * and the caller is solely responsible for ensuring that the objects it * dereferences are accessible. */ void kernfs_break_active_protection(struct kernfs_node *kn) { /* * Take out ourself out of the active ref dependency chain. If * we're called without an active ref, lockdep will complain. */ kernfs_put_active(kn); } /** * kernfs_unbreak_active_protection - undo kernfs_break_active_protection() * @kn: the self kernfs_node * * If kernfs_break_active_protection() was called, this function must be * invoked before finishing the kernfs operation. Note that while this * function restores the active reference, it doesn't and can't actually * restore the active protection - @kn may already or be in the process of * being removed. Once kernfs_break_active_protection() is invoked, that * protection is irreversibly gone for the kernfs operation instance. * * While this function may be called at any point after * kernfs_break_active_protection() is invoked, its most useful location * would be right before the enclosing kernfs operation returns. */ void kernfs_unbreak_active_protection(struct kernfs_node *kn) { /* * @kn->active could be in any state; however, the increment we do * here will be undone as soon as the enclosing kernfs operation * finishes and this temporary bump can't break anything. If @kn * is alive, nothing changes. If @kn is being deactivated, the * soon-to-follow put will either finish deactivation or restore * deactivated state. If @kn is already removed, the temporary * bump is guaranteed to be gone before @kn is released. 
*/ atomic_inc(&kn->active); if (kernfs_lockdep(kn)) rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); } /** * kernfs_remove_self - remove a kernfs_node from its own method * @kn: the self kernfs_node to remove * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. This can be used to * implement a file operation which deletes itself. * * For example, the "delete" file for a sysfs device directory can be * implemented by invoking kernfs_remove_self() on the "delete" file * itself. This function breaks the circular dependency of trying to * deactivate self while holding an active ref itself. It isn't necessary * to modify the usual removal path to use kernfs_remove_self(). The * "delete" implementation can simply invoke kernfs_remove_self() on self * before proceeding with the usual removal path. kernfs will ignore later * kernfs_remove() on self. * * kernfs_remove_self() can be called multiple times concurrently on the * same kernfs_node. Only the first one actually performs removal and * returns %true. All others will wait until the kernfs operation which * won self-removal finishes and return %false. Note that the losers wait * for the completion of not only the winning kernfs_remove_self() but also * the whole kernfs_ops which won the arbitration. This can be used to * guarantee, for example, all concurrent writes to a "delete" file to * finish only after the whole operation is complete. * * Return: %true if @kn is removed by this call, otherwise %false. */ bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_rwsem); kernfs_break_active_protection(kn); /* * SUICIDAL is used to arbitrate among competing invocations. Only * the first one will actually perform removal. When the removal * is complete, SUICIDED is set and the active ref is restored * while kernfs_rwsem for held exclusive. 
The ones which lost * arbitration waits for SUICIDED && drained which can happen only * after the enclosing kernfs operation which executed the winning * instance of kernfs_remove_self() finished. */ if (!(kn->flags & KERNFS_SUICIDAL)) { kn->flags |= KERNFS_SUICIDAL; __kernfs_remove(kn); kn->flags |= KERNFS_SUICIDED; ret = true; } else { wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; DEFINE_WAIT(wait); while (true) { prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); if ((kn->flags & KERNFS_SUICIDED) && atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break; up_write(&root->kernfs_rwsem); schedule(); down_write(&root->kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); ret = false; } /* * This must be done while kernfs_rwsem held exclusive; otherwise, * waiting for SUICIDED && deactivated could finish prematurely. */ kernfs_unbreak_active_protection(kn); up_write(&root->kernfs_rwsem); return ret; } /** * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it * @parent: parent of the target * @name: name of the kernfs_node to remove * @ns: namespace tag of the kernfs_node to remove * * Look for the kernfs_node with @name and @ns under @parent and remove it. * * Return: %0 on success, -ENOENT if such entry doesn't exist. 
*/ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root; if (!parent) { WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", name); return -ENOENT; } root = kernfs_root(parent); down_write(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); if (kn) { kernfs_get(kn); __kernfs_remove(kn); kernfs_put(kn); } up_write(&root->kernfs_rwsem); if (kn) return 0; else return -ENOENT; } /** * kernfs_rename_ns - move and rename a kernfs_node * @kn: target node * @new_parent: new parent to put @sd under * @new_name: new name * @new_ns: new namespace tag * * Return: %0 on success, -errno on failure. */ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { struct kernfs_node *old_parent; struct kernfs_root *root; const char *old_name = NULL; int error; /* can't move or rename root */ if (!kn->parent) return -EINVAL; root = kernfs_root(kn); down_write(&root->kernfs_rwsem); error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || (new_parent->flags & KERNFS_EMPTY_DIR)) goto out; error = 0; if ((kn->parent == new_parent) && (kn->ns == new_ns) && (strcmp(kn->name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; if (kernfs_find_ns(new_parent, new_name, new_ns)) goto out; /* rename kernfs_node */ if (strcmp(kn->name, new_name) != 0) { error = -ENOMEM; new_name = kstrdup_const(new_name, GFP_KERNEL); if (!new_name) goto out; } else { new_name = NULL; } /* * Move to the appropriate place in the appropriate directories rbtree. 
*/ kernfs_unlink_sibling(kn); kernfs_get(new_parent); /* rename_lock protects ->parent and ->name accessors */ write_lock_irq(&kernfs_rename_lock); old_parent = kn->parent; kn->parent = new_parent; kn->ns = new_ns; if (new_name) { old_name = kn->name; kn->name = new_name; } write_unlock_irq(&kernfs_rename_lock); kn->hash = kernfs_name_hash(kn->name, kn->ns); kernfs_link_sibling(kn); kernfs_put(old_parent); kfree_const(old_name); error = 0; out: up_write(&root->kernfs_rwsem); return error; } static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) { kernfs_put(filp->private_data); return 0; } static struct kernfs_node *kernfs_dir_pos(const void *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { int valid = kernfs_active(pos) && pos->parent == parent && hash == pos->hash; kernfs_put(pos); if (!valid) pos = NULL; } if (!pos && (hash > 1) && (hash < INT_MAX)) { struct rb_node *node = parent->dir.children.rb_node; while (node) { pos = rb_to_kn(node); if (hash < pos->hash) node = node->rb_left; else if (hash > pos->hash) node = node->rb_right; else break; } } /* Skip over entries which are dying/dead or in the wrong namespace */ while (pos && (!kernfs_active(pos) || pos->ns != ns)) { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } return pos; } static struct kernfs_node *kernfs_dir_next_pos(const void *ns, struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) { pos = kernfs_dir_pos(ns, parent, ino, pos); if (pos) { do { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } while (pos && (!kernfs_active(pos) || pos->ns != ns)); } return pos; } static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; struct kernfs_root *root; const void *ns = NULL; if 
(!dir_emit_dots(file, ctx)) return 0; root = kernfs_root(parent); down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); pos; pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { const char *name = pos->name; unsigned int type = fs_umode_to_dtype(pos->mode); int len = strlen(name); ino_t ino = kernfs_ino(pos); ctx->pos = pos->hash; file->private_data = pos; kernfs_get(pos); up_read(&root->kernfs_rwsem); if (!dir_emit(ctx, name, len, ino, type)) return 0; down_read(&root->kernfs_rwsem); } up_read(&root->kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; } const struct file_operations kernfs_dir_fops = { .read = generic_read_dir, .iterate_shared = kernfs_fop_readdir, .release = kernfs_dir_fop_release, .llseek = generic_file_llseek, }; |
| 44 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_64_H #define _ASM_X86_PGTABLE_64_H #include <linux/const.h> #include <asm/pgtable_64_types.h> #ifndef __ASSEMBLY__ /* * This file contains the functions and defines necessary to modify and use * the x86-64 page table tree. 
*/
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
#include <asm/fixmap.h>

/*
 * Page-table arrays that are only declared here; their definitions are not
 * part of this header.
 */
extern p4d_t level4_kernel_pgt[512];
extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM];
extern pgd_t init_top_pgt[];

/* swapper_pg_dir is simply another name for init_top_pgt here. */
#define swapper_pg_dir init_top_pgt

extern void paging_init(void);
/* Intentionally empty in this header. */
static inline void sync_initial_page_table(void) { }

/*
 * Error reporters: print file, line, the address of the entry and its raw
 * value through pr_err().  The argument must be an lvalue because its
 * address is taken with &(e).
 */
#define pte_ERROR(e)					\
	pr_err("%s:%d: bad pte %p(%016lx)\n",		\
	       __FILE__, __LINE__, &(e), pte_val(e))
#define pmd_ERROR(e)					\
	pr_err("%s:%d: bad pmd %p(%016lx)\n",		\
	       __FILE__, __LINE__, &(e), pmd_val(e))
#define pud_ERROR(e)					\
	pr_err("%s:%d: bad pud %p(%016lx)\n",		\
	       __FILE__, __LINE__, &(e), pud_val(e))

/* p4d_ERROR is only provided when at least five levels are configured. */
#if CONFIG_PGTABLE_LEVELS >= 5
#define p4d_ERROR(e)					\
	pr_err("%s:%d: bad p4d %p(%016lx)\n",		\
	       __FILE__, __LINE__, &(e), p4d_val(e))
#endif

#define pgd_ERROR(e)					\
	pr_err("%s:%d: bad pgd %p(%016lx)\n",		\
	       __FILE__, __LINE__, &(e), pgd_val(e))

struct mm_struct;

/*
 * The self-referential define advertises that this header supplies its own
 * mm_p4d_folded().  The result depends only on pgtable_l5_enabled(); @mm is
 * not inspected.
 */
#define mm_p4d_folded mm_p4d_folded
static inline bool mm_p4d_folded(struct mm_struct *mm)
{
	return !pgtable_l5_enabled();
}

void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);

/* Store @pte into *@ptep with a single WRITE_ONCE(). */
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
	WRITE_ONCE(*ptep, pte);
}

/* Clear *@ptep by storing a zero pte; @mm and @addr are unused here. */
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep)
{
	native_set_pte(ptep, native_make_pte(0));
}

/* Same store as native_set_pte(); no extra work is done in this variant. */
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	native_set_pte(ptep, pte);
}

/* Store @pmd into *@pmdp with a single WRITE_ONCE(). */
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
	WRITE_ONCE(*pmdp, pmd);
}

/* Clear *@pmd by storing a zero pmd. */
static inline void native_pmd_clear(pmd_t *pmd)
{
	native_set_pmd(pmd, native_make_pmd(0));
}

/*
 * Return the old pte and leave a zero entry behind.  With CONFIG_SMP the
 * exchange is done with xchg(); otherwise it is a plain read followed by a
 * clear.
 */
static inline pte_t native_ptep_get_and_clear(pte_t *xp)
{
#ifdef CONFIG_SMP
	return native_make_pte(xchg(&xp->pte, 0));
#else
	/* native_local_ptep_get_and_clear,
	   but duplicated because of cyclic dependency */
	pte_t ret = *xp;
	native_pte_clear(NULL, 0, xp);
	return ret;
#endif
}

/* pmd counterpart of native_ptep_get_and_clear(). */
static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
#ifdef CONFIG_SMP
	return native_make_pmd(xchg(&xp->pmd, 0));
#else
	/* native_local_pmdp_get_and_clear,
	   but duplicated because of cyclic dependency */
	pmd_t ret = *xp;
	native_pmd_clear(xp);
	return ret;
#endif
}

/* Store @pud into *@pudp with a single WRITE_ONCE(). */
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
	WRITE_ONCE(*pudp, pud);
}

/* Clear *@pud by storing a zero pud. */
static inline void native_pud_clear(pud_t *pud)
{
	native_set_pud(pud, native_make_pud(0));
}

/* pud counterpart of native_ptep_get_and_clear(). */
static inline pud_t native_pudp_get_and_clear(pud_t *xp)
{
#ifdef CONFIG_SMP
	return native_make_pud(xchg(&xp->pud, 0));
#else
	/* native_local_pudp_get_and_clear,
	 * but duplicated because of cyclic dependency
	 */
	pud_t ret = *xp;

	native_pud_clear(xp);
	return ret;
#endif
}

/*
 * Store a p4d entry.
 *
 * When 5-level paging is enabled, or page-table isolation is not configured,
 * the value is written directly.  Otherwise the slot is treated as a pgd
 * slot (note the cast) and the value is first passed through
 * pti_set_user_pgtbl(), exactly as native_set_pgd() does, before being
 * written back as a p4d.
 */
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
	pgd_t pgd;

	if (pgtable_l5_enabled() ||
	    !IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) {
		WRITE_ONCE(*p4dp, p4d);
		return;
	}

	pgd = native_make_pgd(native_p4d_val(p4d));
	pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd);
	WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(pgd)));
}

/* Clear *@p4d by storing a zero p4d through native_set_p4d(). */
static inline void native_p4d_clear(p4d_t *p4d)
{
	native_set_p4d(p4d, native_make_p4d(0));
}

/* Store the value returned by pti_set_user_pgtbl() into *@pgdp. */
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
	WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd));
}

/* Clear *@pgd by storing a zero pgd through native_set_pgd(). */
static inline void native_pgd_clear(pgd_t *pgd)
{
	native_set_pgd(pgd, native_make_pgd(0));
}

/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
 */

/* PGD - Level 4 access */

/* PUD - Level 3 access */

/* PMD - Level 2 access */

/* PTE - Level 1 access */

/*
 * Encode and de-code a swap entry
 *
 * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
 * | ...
|SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| E|F|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above * there. We also need to avoid using A and D because of an * erratum where they can be incorrectly set by hardware on * non-present PTEs. * * SD Bits 1-4 are not used in non-present format and available for * special use described below: * * SD (1) in swp entry is used to store soft dirty bit, which helps us * remember soft dirty over page migration * * F (2) in swp entry is used to record when a pagetable is * writeprotected by userfaultfd WP support. * * E (3) in swp entry is used to remember PG_anon_exclusive. * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. * * The offset is inverted by a binary not operation to make the high * physical bits set. */ #define SWP_TYPE_BITS 5 #define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) /* We always extract/encode the offset by shifting it all the way up, and then down again */ #define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) /* Extract the high bits for type */ #define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS)) /* Shift up (to get rid of type), then down to get value */ #define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT) /* * Shift the offset up "too far" by TYPE bits, then down again * The offset is inverted by a binary not operation to make the high * physical bits set. 
*/ #define __swp_entry(type, offset) ((swp_entry_t) { \ (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) }) #define __swp_entry_to_pte(x) (__pte((x).val)) #define __swp_entry_to_pmd(x) (__pmd((x).val)) extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #define HAVE_ARCH_UNMAPPED_AREA_VMFLAGS #define PAGE_AGP PAGE_KERNEL_NOCACHE #define HAVE_PAGE_AGP 1 /* fs/proc/kcore.c */ #define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) #define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) #define __HAVE_ARCH_PTE_SAME #define vmemmap ((struct page *)VMEMMAP_START) extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); #define gup_fast_permitted gup_fast_permitted static inline bool gup_fast_permitted(unsigned long start, unsigned long end) { if (end >> __VIRTUAL_MASK_SHIFT) return false; return true; } #include <asm/pgtable-invert.h> #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ |
| 17 4 9 5 5 9 9 9 9 9 4 7 7 6 7 7 5 5 5 5 1 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 6 4 2 1 9 9 8 1 9 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 
500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 | /* inflate.c -- zlib decompression * Copyright (C) 1995-2005 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * * Based on zlib 1.2.3 but modified for the Linux Kernel by * Richard Purdie <richard@openedhand.com> * * Changes mainly for static instead of dynamic memory allocation * */ #include <linux/zutil.h> #include "inftrees.h" #include "inflate.h" #include "inffast.h" #include "infutil.h" /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC # include "../zlib_dfltcc/dfltcc_inflate.h" #else #define INFLATE_RESET_HOOK(strm) do {} while (0) #define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0) #define INFLATE_NEED_UPDATEWINDOW(strm) 1 #define INFLATE_NEED_CHECKSUM(strm) 1 #endif int 
zlib_inflate_workspacesize(void) { return sizeof(struct inflate_workspace); } int zlib_inflateReset(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; strm->total_in = strm->total_out = state->total = 0; strm->msg = NULL; strm->adler = 1; /* to support ill-conceived Java test suite */ state->mode = HEAD; state->last = 0; state->havedict = 0; state->dmax = 32768U; state->hold = 0; state->bits = 0; state->lencode = state->distcode = state->next = state->codes; /* Initialise Window */ state->wsize = 1U << state->wbits; state->write = 0; state->whave = 0; INFLATE_RESET_HOOK(strm); return Z_OK; } int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; /* in case we return an error */ state = &WS(strm)->inflate_state; strm->state = (struct internal_state *)state; if (windowBits < 0) { state->wrap = 0; windowBits = -windowBits; } else { state->wrap = (windowBits >> 4) + 1; } if (windowBits < 8 || windowBits > 15) { return Z_STREAM_ERROR; } state->wbits = (unsigned)windowBits; #ifdef CONFIG_ZLIB_DFLTCC /* * DFLTCC requires the window to be page aligned. * Thus, we overallocate and take the aligned portion of the buffer. */ state->window = PTR_ALIGN(&WS(strm)->working_window[0], PAGE_SIZE); #else state->window = &WS(strm)->working_window[0]; #endif return zlib_inflateReset(strm); } /* Return state with length and distance decoding tables and index sizes set to fixed code decoding. This returns fixed tables from inffixed.h. */ static void zlib_fixedtables(struct inflate_state *state) { # include "inffixed.h" state->lencode = lenfix; state->lenbits = 9; state->distcode = distfix; state->distbits = 5; } /* Update the window with the last wsize (normally 32K) bytes written before returning. 
This is only called when a window is already in use, or when output has been written during this inflate call, but the end of the deflate stream has not been reached yet. It is also called to window dictionary data when a dictionary is loaded. Providing output buffers larger than 32K to inflate() should provide a speed advantage, since only the last 32K of output is copied to the sliding window upon return from inflate(), and since all distances after the first 32K of output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. */ static void zlib_updatewindow(z_streamp strm, unsigned out) { struct inflate_state *state; unsigned copy, dist; state = (struct inflate_state *)strm->state; /* copy state->wsize or less output bytes into the circular window */ copy = out - strm->avail_out; if (copy >= state->wsize) { memcpy(state->window, strm->next_out - state->wsize, state->wsize); state->write = 0; state->whave = state->wsize; } else { dist = state->wsize - state->write; if (dist > copy) dist = copy; memcpy(state->window + state->write, strm->next_out - copy, dist); copy -= dist; if (copy) { memcpy(state->window, strm->next_out - copy, copy); state->write = copy; state->whave = state->wsize; } else { state->write += dist; if (state->write == state->wsize) state->write = 0; if (state->whave < state->wsize) state->whave += dist; } } } /* * At the end of a Deflate-compressed PPP packet, we expect to have seen * a `stored' block type value but not the (zero) length bytes. */ /* Returns true if inflate is currently at the end of a block generated by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored block. When decompressing, PPP checks that at the end of input packet, inflate is waiting for these length bytes. 
*/ static int zlib_inflateSyncPacket(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == STORED && state->bits == 0) { state->mode = TYPE; return Z_OK; } return Z_DATA_ERROR; } /* Macros for inflate(): */ /* check function to use adler32() for zlib or crc32() for gzip */ #define UPDATE(check, buf, len) zlib_adler32(check, buf, len) /* Load registers with state in inflate() for speed */ #define LOAD() \ do { \ put = strm->next_out; \ left = strm->avail_out; \ next = strm->next_in; \ have = strm->avail_in; \ hold = state->hold; \ bits = state->bits; \ } while (0) /* Restore state from registers in inflate() */ #define RESTORE() \ do { \ strm->next_out = put; \ strm->avail_out = left; \ strm->next_in = next; \ strm->avail_in = have; \ state->hold = hold; \ state->bits = bits; \ } while (0) /* Clear the input bit accumulator */ #define INITBITS() \ do { \ hold = 0; \ bits = 0; \ } while (0) /* Get a byte of input into the bit accumulator, or return from inflate() if there is no input available. */ #define PULLBYTE() \ do { \ if (have == 0) goto inf_leave; \ have--; \ hold += (unsigned long)(*next++) << bits; \ bits += 8; \ } while (0) /* Assure that there are at least n bits in the bit accumulator. If there is not enough available input to do that, then return from inflate(). 
*/
/* Expands to a loop over PULLBYTE(), so it relies on whatever locals and
   labels PULLBYTE() itself uses. */
#define NEEDBITS(n) \
    do { \
        while (bits < (unsigned)(n)) \
            PULLBYTE(); \
    } while (0)

/* Return the low n bits of the bit accumulator (n < 16) */
#define BITS(n) \
    ((unsigned)hold & ((1U << (n)) - 1))

/* Remove n bits from the bit accumulator */
#define DROPBITS(n) \
    do { \
        hold >>= (n); \
        bits -= (unsigned)(n); \
    } while (0)

/* Remove zero to seven bits as needed to go to a byte boundary */
#define BYTEBITS() \
    do { \
        hold >>= bits & 7; \
        bits -= bits & 7; \
    } while (0)

/*
   inflate() uses a state machine to process as much input data and generate as
   much output data as possible before returning.  The state machine is
   structured roughly as follows:

    for (;;) switch (state) {
    ...
    case STATEn:
        if (not enough input data or output space to make progress)
            return;
        ... make progress ...
        state = STATEm;
        break;
    ...
    }

   so when inflate() is called again, the same case is attempted again, and
   if the appropriate resources are provided, the machine proceeds to the
   next state.  The NEEDBITS() macro is usually the way the state evaluates
   whether it can proceed or should return.  NEEDBITS() does the return if
   the requested bits are not available.  The typical use of the BITS macros
   is:

        NEEDBITS(n);
        ... do something with BITS(n) ...
        DROPBITS(n);

   where NEEDBITS(n) either returns from inflate() if there isn't enough
   input left to load n bits into the accumulator, or it continues.  BITS(n)
   gives the low n bits in the accumulator.  When done, DROPBITS(n) drops
   the low n bits off the accumulator.  INITBITS() clears the accumulator
   and sets the number of available bits to zero.  BYTEBITS() discards just
   enough bits to put the accumulator on a byte boundary.  After BYTEBITS()
   and a NEEDBITS(8), then BITS(8) would return the next byte in the stream.

   NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return
   if there is no input available.  The decoding of variable length codes uses
   PULLBYTE() directly in order to pull just enough bytes to decode the next
   code, and no more.
Some states loop until they get enough input, making sure that enough state information is maintained to continue the loop where it left off if NEEDBITS() returns in the loop. For example, want, need, and keep would all have to actually be part of the saved state in case NEEDBITS() returns: case STATEw: while (want < need) { NEEDBITS(n); keep[want++] = BITS(n); DROPBITS(n); } state = STATEx; case STATEx: As shown above, if the next state is also the next case, then the break is omitted. A state may also return if there is not enough output space available to complete that state. Those states are copying stored data, writing a literal byte, and copying a matching string. When returning, a "goto inf_leave" is used to update the total counters, update the check value, and determine whether any progress has been made during that inflate() call in order to return the proper return code. Progress is defined as a change in either strm->avail_in or strm->avail_out. When there is a window, goto inf_leave will update the window with the last output written. If a goto inf_leave occurs in the middle of decompression and there is no window currently, goto inf_leave will create one and copy output to the window for the next call of inflate(). In this implementation, the flush parameter of inflate() only affects the return code (per zlib.h). inflate() always writes as much as possible to strm->next_out, given the space available and the provided input--the effect documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers the allocation of and copying into a sliding window until necessary, which provides the effect documented in zlib.h for Z_FINISH when the entire input stream available. So the only thing the flush parameter actually does is: when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it will return Z_BUF_ERROR if it has not reached the end of the stream. 
*/
/*
 * Run the decompression state machine on the stream.
 *
 * strm  - stream whose state pointer was set up beforehand; next_in,
 *         avail_in, next_out and avail_out describe the buffers.
 * flush - Z_BLOCK stops at the TYPE state, Z_PACKET_FLUSH may hand over to
 *         zlib_inflateSyncPacket() on exit, Z_FINISH turns a would-be Z_OK
 *         into Z_BUF_ERROR.
 *
 * Returns Z_STREAM_ERROR for a NULL stream/state, or NULL next_in with a
 * non-zero avail_in; Z_NEED_DICT from the DICT state when no dictionary was
 * provided; Z_STREAM_END, Z_DATA_ERROR, Z_MEM_ERROR from the matching
 * states; otherwise Z_OK or Z_BUF_ERROR as computed at inf_leave.
 *
 * The working values (put, left, next, have, hold, bits) are local copies
 * filled by LOAD() and written back by RESTORE().  NEEDBITS()/PULLBYTE()
 * may jump to inf_leave from any state when input runs out.
 */
int zlib_inflate(z_streamp strm, int flush)
{
    struct inflate_state *state;
    const unsigned char *next;  /* next input */
    unsigned char *put;         /* next output */
    unsigned have, left;        /* available input and output */
    unsigned long hold;         /* bit buffer */
    unsigned bits;              /* bits in bit buffer */
    unsigned in, out;           /* save starting available input and output */
    unsigned copy;              /* number of stored or match bytes to copy */
    unsigned char *from;        /* where to copy match bytes from */
    code this;                  /* current decoding table entry */
    code last;                  /* parent table entry */
    unsigned len;               /* length to copy for repeats, bits to drop */
    int ret;                    /* return code */
    static const unsigned short order[19] = /* permutation of code lengths */
        {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};

    /* Do not check for strm->next_out == NULL here as ppc zImage
       inflates to strm->next_out = 0 */

    if (strm == NULL || strm->state == NULL ||
        (strm->next_in == NULL && strm->avail_in != 0))
        return Z_STREAM_ERROR;

    state = (struct inflate_state *)strm->state;

    if (state->mode == TYPE) state->mode = TYPEDO;      /* skip check */
    LOAD();
    in = have;
    out = left;
    ret = Z_OK;
    for (;;)
        switch (state->mode) {
        /* Stream header: skipped entirely when state->wrap is zero. */
        case HEAD:
            if (state->wrap == 0) {
                state->mode = TYPEDO;
                break;
            }
            NEEDBITS(16);
            if (
                ((BITS(8) << 8) + (hold >> 8)) % 31) {
                strm->msg = (char *)"incorrect header check";
                state->mode = BAD;
                break;
            }
            if (BITS(4) != Z_DEFLATED) {
                strm->msg = (char *)"unknown compression method";
                state->mode = BAD;
                break;
            }
            DROPBITS(4);
            len = BITS(4) + 8;
            if (len > state->wbits) {
                strm->msg = (char *)"invalid window size";
                state->mode = BAD;
                break;
            }
            state->dmax = 1U << len;
            strm->adler = state->check = zlib_adler32(0L, NULL, 0);
            state->mode = hold & 0x200 ? DICTID : TYPE;
            INITBITS();
            break;
        case DICTID:
            NEEDBITS(32);
            strm->adler = state->check = REVERSE(hold);
            INITBITS();
            state->mode = DICT;
            fallthrough;
        case DICT:
            if (state->havedict == 0) {
                /* write the local copies back before the early return */
                RESTORE();
                return Z_NEED_DICT;
            }
            strm->adler = state->check = zlib_adler32(0L, NULL, 0);
            state->mode = TYPE;
            fallthrough;
        case TYPE:
            if (flush == Z_BLOCK) goto inf_leave;
            fallthrough;
        /* Block header: 1 "last" bit followed by a 2-bit block type. */
        case TYPEDO:
            INFLATE_TYPEDO_HOOK(strm, flush);
            if (state->last) {
                BYTEBITS();
                state->mode = CHECK;
                break;
            }
            NEEDBITS(3);
            state->last = BITS(1);
            DROPBITS(1);
            switch (BITS(2)) {
            case 0:                             /* stored block */
                state->mode = STORED;
                break;
            case 1:                             /* fixed block */
                zlib_fixedtables(state);
                state->mode = LEN;              /* decode codes */
                break;
            case 2:                             /* dynamic block */
                state->mode = TABLE;
                break;
            case 3:
                strm->msg = (char *)"invalid block type";
                state->mode = BAD;
            }
            DROPBITS(2);
            break;
        case STORED:
            BYTEBITS();                         /* go to byte boundary */
            NEEDBITS(32);
            /* low 16 bits must be the one's complement of the high 16 */
            if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) {
                strm->msg = (char *)"invalid stored block lengths";
                state->mode = BAD;
                break;
            }
            state->length = (unsigned)hold & 0xffff;
            INITBITS();
            state->mode = COPY;
            fallthrough;
        case COPY:
            copy = state->length;
            if (copy) {
                /* limited by both remaining input and remaining output */
                if (copy > have) copy = have;
                if (copy > left) copy = left;
                if (copy == 0) goto inf_leave;
                memcpy(put, next, copy);
                have -= copy;
                next += copy;
                left -= copy;
                put += copy;
                state->length -= copy;
                break;
            }
            state->mode = TYPE;
            break;
        /* Dynamic block header: symbol counts for the three code sets. */
        case TABLE:
            NEEDBITS(14);
            state->nlen = BITS(5) + 257;
            DROPBITS(5);
            state->ndist = BITS(5) + 1;
            DROPBITS(5);
            state->ncode = BITS(4) + 4;
            DROPBITS(4);
#ifndef PKZIP_BUG_WORKAROUND
            if (state->nlen > 286 || state->ndist > 30) {
                strm->msg = (char *)"too many length or distance symbols";
                state->mode = BAD;
                break;
            }
#endif
            state->have = 0;
            state->mode = LENLENS;
            fallthrough;
        case LENLENS:
            /* state->have is the resume point if NEEDBITS() leaves early */
            while (state->have < state->ncode) {
                NEEDBITS(3);
                state->lens[order[state->have++]] = (unsigned short)BITS(3);
                DROPBITS(3);
            }
            while (state->have < 19)
                state->lens[order[state->have++]] = 0;
            state->next = state->codes;
            state->lencode = (code const *)(state->next);
            state->lenbits = 7;
            ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next),
                                &(state->lenbits), state->work);
            if (ret) {
                strm->msg = (char *)"invalid code lengths set";
                state->mode = BAD;
                break;
            }
            state->have = 0;
            state->mode = CODELENS;
            fallthrough;
        case CODELENS:
            while (state->have < state->nlen + state->ndist) {
                for (;;) {
                    this = state->lencode[BITS(state->lenbits)];
                    if ((unsigned)(this.bits) <= bits) break;
                    PULLBYTE();
                }
                if (this.val < 16) {
                    NEEDBITS(this.bits);
                    DROPBITS(this.bits);
                    state->lens[state->have++] = this.val;
                }
                else {
                    /* 16 repeats the previous length; 17 and 18 repeat zero */
                    if (this.val == 16) {
                        NEEDBITS(this.bits + 2);
                        DROPBITS(this.bits);
                        if (state->have == 0) {
                            strm->msg = (char *)"invalid bit length repeat";
                            state->mode = BAD;
                            break;
                        }
                        len = state->lens[state->have - 1];
                        copy = 3 + BITS(2);
                        DROPBITS(2);
                    }
                    else if (this.val == 17) {
                        NEEDBITS(this.bits + 3);
                        DROPBITS(this.bits);
                        len = 0;
                        copy = 3 + BITS(3);
                        DROPBITS(3);
                    }
                    else {
                        NEEDBITS(this.bits + 7);
                        DROPBITS(this.bits);
                        len = 0;
                        copy = 11 + BITS(7);
                        DROPBITS(7);
                    }
                    if (state->have + copy > state->nlen + state->ndist) {
                        strm->msg = (char *)"invalid bit length repeat";
                        state->mode = BAD;
                        break;
                    }
                    while (copy--)
                        state->lens[state->have++] = (unsigned short)len;
                }
            }

            /* handle error breaks in while */
            if (state->mode == BAD) break;

            /* build code tables */
            state->next = state->codes;
            state->lencode = (code const *)(state->next);
            state->lenbits = 9;
            ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next),
                                &(state->lenbits), state->work);
            if (ret) {
                strm->msg = (char *)"invalid literal/lengths set";
                state->mode = BAD;
                break;
            }
            state->distcode = (code const *)(state->next);
            state->distbits = 6;
            ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist,
                            &(state->next), &(state->distbits), state->work);
            if (ret) {
                strm->msg = (char *)"invalid distances set";
                state->mode = BAD;
                break;
            }
            state->mode = LEN;
            fallthrough;
        case LEN:
            /* with enough input and output available, use inflate_fast() */
            if (have >= 6 && left >= 258) {
                RESTORE();
                inflate_fast(strm, out);
                LOAD();
                break;
            }
            for (;;) {
                this = state->lencode[BITS(state->lenbits)];
                if ((unsigned)(this.bits) <= bits) break;
                PULLBYTE();
            }
            /* a non-zero op with the high nibble clear selects a sub-table */
            if (this.op && (this.op & 0xf0) == 0) {
                last = this;
                for (;;) {
                    this = state->lencode[last.val +
                            (BITS(last.bits + last.op) >> last.bits)];
                    if ((unsigned)(last.bits + this.bits) <= bits) break;
                    PULLBYTE();
                }
                DROPBITS(last.bits);
            }
            DROPBITS(this.bits);
            state->length = (unsigned)this.val;
            if ((int)(this.op) == 0) {
                state->mode = LIT;
                break;
            }
            if (this.op & 32) {
                state->mode = TYPE;
                break;
            }
            if (this.op & 64) {
                strm->msg = (char *)"invalid literal/length code";
                state->mode = BAD;
                break;
            }
            state->extra = (unsigned)(this.op) & 15;
            state->mode = LENEXT;
            fallthrough;
        case LENEXT:
            if (state->extra) {
                NEEDBITS(state->extra);
                state->length += BITS(state->extra);
                DROPBITS(state->extra);
            }
            state->mode = DIST;
            fallthrough;
        case DIST:
            for (;;) {
                this = state->distcode[BITS(state->distbits)];
                if ((unsigned)(this.bits) <= bits) break;
                PULLBYTE();
            }
            if ((this.op & 0xf0) == 0) {
                last = this;
                for (;;) {
                    this = state->distcode[last.val +
                            (BITS(last.bits + last.op) >> last.bits)];
                    if ((unsigned)(last.bits + this.bits) <= bits) break;
                    PULLBYTE();
                }
                DROPBITS(last.bits);
            }
            DROPBITS(this.bits);
            if (this.op & 64) {
                strm->msg = (char *)"invalid distance code";
                state->mode = BAD;
                break;
            }
            state->offset = (unsigned)this.val;
            state->extra = (unsigned)(this.op) & 15;
            state->mode = DISTEXT;
            fallthrough;
        case DISTEXT:
            if (state->extra) {
                NEEDBITS(state->extra);
                state->offset += BITS(state->extra);
                DROPBITS(state->extra);
            }
#ifdef INFLATE_STRICT
            if (state->offset > state->dmax) {
                strm->msg = (char *)"invalid distance too far back";
                state->mode = BAD;
                break;
            }
#endif
            /* distance may not reach past window contents plus the bytes
               written so far in this call */
            if (state->offset > state->whave + out - left) {
                strm->msg = (char *)"invalid distance too far back";
                state->mode = BAD;
                break;
            }
            state->mode = MATCH;
            fallthrough;
        case MATCH:
            if (left == 0) goto inf_leave;
            copy = out - left;
            if (state->offset > copy) {         /* copy from window */
                copy = state->offset - copy;
                if (copy > state->write) {
                    copy -= state->write;
                    from = state->window + (state->wsize - copy);
                }
                else
                    from = state->window + (state->write - copy);
                if (copy > state->length) copy = state->length;
            }
            else {                              /* copy from output */
                from = put - state->offset;
                copy = state->length;
            }
            if (copy > left) copy = left;
            left -= copy;
            state->length -= copy;
            /* byte-by-byte copy loop */
            do {
                *put++ = *from++;
            } while (--copy);
            if (state->length == 0) state->mode = LEN;
            break;
        case LIT:
            if (left == 0) goto inf_leave;
            *put++ = (unsigned char)(state->length);
            left--;
            state->mode = LEN;
            break;
        /* Trailer: only verified when state->wrap is non-zero. */
        case CHECK:
            if (state->wrap) {
                NEEDBITS(32);
                out -= left;
                strm->total_out += out;
                state->total += out;
                if (INFLATE_NEED_CHECKSUM(strm) && out)
                    strm->adler = state->check =
                        UPDATE(state->check, put - out, out);
                out = left;
                if ((
                     REVERSE(hold)) != state->check) {
                    strm->msg = (char *)"incorrect data check";
                    state->mode = BAD;
                    break;
                }
                INITBITS();
            }
            state->mode = DONE;
            fallthrough;
        case DONE:
            ret = Z_STREAM_END;
            goto inf_leave;
        case BAD:
            ret = Z_DATA_ERROR;
            goto inf_leave;
        case MEM:
            return Z_MEM_ERROR;
        case SYNC:
        default:
            return Z_STREAM_ERROR;
        }

    /*
       Return from inflate(), updating the total counts and the check value.
       If there was no progress during the inflate() call, return a buffer
       error.  Call zlib_updatewindow() to create and/or update the window state.
     */
  inf_leave:
    RESTORE();
    if (INFLATE_NEED_UPDATEWINDOW(strm) &&
            (state->wsize || (state->mode < CHECK && out != strm->avail_out)))
        zlib_updatewindow(strm, out);

    /* from here on, in and out hold the amounts consumed and produced */
    in -= strm->avail_in;
    out -= strm->avail_out;
    strm->total_in += in;
    strm->total_out += out;
    state->total += out;
    if (INFLATE_NEED_CHECKSUM(strm) && state->wrap && out)
        strm->adler = state->check =
            UPDATE(state->check, strm->next_out - out, out);

    strm->data_type = state->bits + (state->last ? 64 : 0) +
                      (state->mode == TYPE ? 128 : 0);

    if (flush == Z_PACKET_FLUSH && ret == Z_OK &&
            strm->avail_out != 0 && strm->avail_in == 0)
        return zlib_inflateSyncPacket(strm);

    if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK)
        ret = Z_BUF_ERROR;

    return ret;
}

/*
 * Nothing is released here; the function only validates its argument.
 * Returns Z_STREAM_ERROR when strm or strm->state is NULL, else Z_OK.
 */
int zlib_inflateEnd(z_streamp strm)
{
    if (strm == NULL || strm->state == NULL)
        return Z_STREAM_ERROR;
    return Z_OK;
}

/*
 * This subroutine adds the data at next_in/avail_in to the output history
 * without performing any output.  The output buffer must be "caught up";
 * i.e. no pending output but this should always be the case. The state must
 * be waiting on the start of a block (i.e. mode == TYPE or HEAD).  On exit,
 * the output will also be caught up, and the checksum will have been updated
 * if need be.
 *
 * Returns Z_DATA_ERROR when the mode is neither TYPE nor HEAD, else Z_OK.
 * Note that z and z->state are dereferenced without a NULL check.
 */
int zlib_inflateIncomp(z_stream *z)
{
    struct inflate_state *state = (struct inflate_state *)z->state;
    Byte *saved_no = z->next_out;
    uInt saved_ao = z->avail_out;

    if (state->mode != TYPE && state->mode != HEAD)
        return Z_DATA_ERROR;

    /* Setup some variables to allow misuse of updateWindow */
    z->avail_out = 0;
    z->next_out = (unsigned char*)z->next_in + z->avail_in;

    zlib_updatewindow(z, z->avail_in);

    /* Restore saved variables */
    z->avail_out = saved_ao;
    z->next_out = saved_no;

    /* the input bytes are accounted as both consumed and produced */
    z->adler = state->check =
        UPDATE(state->check, z->next_in, z->avail_in);

    z->total_out += z->avail_in;
    z->total_in += z->avail_in;
    z->next_in += z->avail_in;
    state->total += z->avail_in;
    z->avail_in = 0;

    return Z_OK;
}
|
| 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 | // SPDX-License-Identifier: GPL-2.0 #include "cgroup-internal.h" #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/nsproxy.h> #include <linux/proc_ns.h> /* cgroup namespaces */ static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); } static void dec_cgroup_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); } static struct cgroup_namespace *alloc_cgroup_ns(void) { struct cgroup_namespace *new_ns; int ret; new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); return ERR_PTR(ret); } refcount_set(&new_ns->ns.count, 1); new_ns->ns.ops = &cgroupns_operations; return new_ns; } void free_cgroup_ns(struct cgroup_namespace *ns) { put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); } EXPORT_SYMBOL(free_cgroup_ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { struct cgroup_namespace *new_ns; struct ucounts *ucounts; struct css_set *cset; BUG_ON(!old_ns); if (!(flags & CLONE_NEWCGROUP)) { get_cgroup_ns(old_ns); return old_ns; } /* Allow only sysadmin to create cgroup namespace. 
*/ if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); ucounts = inc_cgroup_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); spin_unlock_irq(&css_set_lock); new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { put_css_set(cset); dec_cgroup_namespaces(ucounts); return new_ns; } new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; new_ns->root_cset = cset; return new_ns; } static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) { return container_of(ns, struct cgroup_namespace, ns); } static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) || !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Don't need to do anything if we are attaching to our own cgroupns. */ if (cgroup_ns == nsproxy->cgroup_ns) return 0; get_cgroup_ns(cgroup_ns); put_cgroup_ns(nsproxy->cgroup_ns); nsproxy->cgroup_ns = cgroup_ns; return 0; } static struct ns_common *cgroupns_get(struct task_struct *task) { struct cgroup_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->cgroup_ns; get_cgroup_ns(ns); } task_unlock(task); return ns ? &ns->ns : NULL; } static void cgroupns_put(struct ns_common *ns) { put_cgroup_ns(to_cg_ns(ns)); } static struct user_namespace *cgroupns_owner(struct ns_common *ns) { return to_cg_ns(ns)->user_ns; } const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, .owner = cgroupns_owner, }; |
| 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 | #ifndef __LZ4DEFS_H__ #define __LZ4DEFS_H__ /* * lz4defs.h -- common and architecture specific defines for the kernel usage * LZ4 - Fast LZ compression algorithm * Copyright (C) 2011-2016, Yann Collet. * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * You can contact the author at : * - LZ4 homepage : http://www.lz4.org * - LZ4 source repository : https://github.com/lz4/lz4 * * Changed for kernel usage by: * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> */ #include <asm/unaligned.h> #include <linux/bitops.h> #include <linux/string.h> /* memset, memcpy */ #define FORCE_INLINE __always_inline /*-************************************ * Basic Types **************************************/ #include <linux/types.h> typedef uint8_t BYTE; typedef uint16_t U16; typedef uint32_t U32; typedef int32_t S32; typedef uint64_t U64; typedef uintptr_t uptrval; /*-************************************ * Architecture specifics **************************************/ #if defined(CONFIG_64BIT) #define LZ4_ARCH64 1 #else #define LZ4_ARCH64 0 #endif #if defined(__LITTLE_ENDIAN) #define LZ4_LITTLE_ENDIAN 1 #else #define LZ4_LITTLE_ENDIAN 0 #endif /*-************************************ * Constants **************************************/ #define MINMATCH 4 #define WILDCOPYLENGTH 8 #define LASTLITERALS 5 #define MFLIMIT (WILDCOPYLENGTH + MINMATCH) /* * ensure it's possible to write 2 x wildcopyLength * without overflowing output buffer */ #define MATCH_SAFEGUARD_DISTANCE ((2 * WILDCOPYLENGTH) - MINMATCH) /* Increase this value ==> compression run slower on incompressible data */ #define LZ4_SKIPTRIGGER 6 #define HASH_UNIT sizeof(size_t) #define KB (1 << 10) #define MB (1 << 20) #define GB (1U << 30) #define MAXD_LOG 
16 #define MAX_DISTANCE ((1 << MAXD_LOG) - 1) #define STEPSIZE sizeof(size_t) #define ML_BITS 4 #define ML_MASK ((1U << ML_BITS) - 1) #define RUN_BITS (8 - ML_BITS) #define RUN_MASK ((1U << RUN_BITS) - 1) /*-************************************ * Reading and writing into memory **************************************/ static FORCE_INLINE U16 LZ4_read16(const void *ptr) { return get_unaligned((const U16 *)ptr); } static FORCE_INLINE U32 LZ4_read32(const void *ptr) { return get_unaligned((const U32 *)ptr); } static FORCE_INLINE size_t LZ4_read_ARCH(const void *ptr) { return get_unaligned((const size_t *)ptr); } static FORCE_INLINE void LZ4_write16(void *memPtr, U16 value) { put_unaligned(value, (U16 *)memPtr); } static FORCE_INLINE void LZ4_write32(void *memPtr, U32 value) { put_unaligned(value, (U32 *)memPtr); } static FORCE_INLINE U16 LZ4_readLE16(const void *memPtr) { return get_unaligned_le16(memPtr); } static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value) { return put_unaligned_le16(value, memPtr); } /* * LZ4 relies on memcpy with a constant size being inlined. In freestanding * environments, the compiler can't assume the implementation of memcpy() is * standard compliant, so apply its specialized memcpy() inlining logic. When * possible, use __builtin_memcpy() to tell the compiler to analyze memcpy() * as-if it were standard compliant, so it can inline it in freestanding * environments. This is needed when decompressing the Linux Kernel, for example. 
*/ #define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) #define LZ4_memmove(dst, src, size) __builtin_memmove(dst, src, size) static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) { #if LZ4_ARCH64 U64 a = get_unaligned((const U64 *)src); put_unaligned(a, (U64 *)dst); #else U32 a = get_unaligned((const U32 *)src); U32 b = get_unaligned((const U32 *)src + 1); put_unaligned(a, (U32 *)dst); put_unaligned(b, (U32 *)dst + 1); #endif } /* * customized variant of memcpy, * which can overwrite up to 7 bytes beyond dstEnd */ static FORCE_INLINE void LZ4_wildCopy(void *dstPtr, const void *srcPtr, void *dstEnd) { BYTE *d = (BYTE *)dstPtr; const BYTE *s = (const BYTE *)srcPtr; BYTE *const e = (BYTE *)dstEnd; do { LZ4_copy8(d, s); d += 8; s += 8; } while (d < e); } static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val) { #if LZ4_LITTLE_ENDIAN return __ffs(val) >> 3; #else return (BITS_PER_LONG - 1 - __fls(val)) >> 3; #endif } static FORCE_INLINE unsigned int LZ4_count( const BYTE *pIn, const BYTE *pMatch, const BYTE *pInLimit) { const BYTE *const pStart = pIn; while (likely(pIn < pInLimit - (STEPSIZE - 1))) { size_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); if (!diff) { pIn += STEPSIZE; pMatch += STEPSIZE; continue; } pIn += LZ4_NbCommonBytes(diff); return (unsigned int)(pIn - pStart); } #if LZ4_ARCH64 if ((pIn < (pInLimit - 3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn += 4; pMatch += 4; } #endif if ((pIn < (pInLimit - 1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn += 2; pMatch += 2; } if ((pIn < pInLimit) && (*pMatch == *pIn)) pIn++; return (unsigned int)(pIn - pStart); } typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive; typedef enum { byPtr, byU32, byU16 } tableType_t; typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; 
typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; #define LZ4_STATIC_ASSERT(c) BUILD_BUG_ON(!(c)) #endif |
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM fib #if !defined(_TRACE_FIB_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FIB_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <net/ip_fib.h> #include <linux/tracepoint.h> TRACE_EVENT(fib_table_lookup, TP_PROTO(u32 tb_id, const struct flowi4 *flp, const struct fib_nh_common *nhc, int err), TP_ARGS(tb_id, flp, nhc, err), TP_STRUCT__entry( __field( u32, tb_id ) __field( int, err ) __field( int, oif ) __field( int, iif ) __field( u8, proto ) __field( __u8, tos ) __field( __u8, scope ) __field( __u8, flags ) __array( __u8, src, 4 ) __array( __u8, dst, 4 ) __array( __u8, gw4, 4 ) __array( __u8, gw6, 16 ) __field( u16, sport ) __field( u16, dport ) __array(char, name, IFNAMSIZ ) ), TP_fast_assign( struct net_device *dev; struct in6_addr *in6; __be32 *p32; __entry->tb_id = tb_id; __entry->err = err; __entry->oif = flp->flowi4_oif; __entry->iif = flp->flowi4_iif; __entry->tos = flp->flowi4_tos; __entry->scope = flp->flowi4_scope; __entry->flags = flp->flowi4_flags; p32 = (__be32 *) __entry->src; *p32 = flp->saddr; p32 = (__be32 *) __entry->dst; *p32 = flp->daddr; __entry->proto = flp->flowi4_proto; if (__entry->proto == IPPROTO_TCP || __entry->proto == IPPROTO_UDP) { __entry->sport = ntohs(flp->fl4_sport); __entry->dport = ntohs(flp->fl4_dport); } else { __entry->sport = 0; __entry->dport = 0; } dev = nhc ? nhc->nhc_dev : NULL; strscpy(__entry->name, dev ? 
dev->name : "-", IFNAMSIZ); if (nhc) { if (nhc->nhc_gw_family == AF_INET) { p32 = (__be32 *) __entry->gw4; *p32 = nhc->nhc_gw.ipv4; in6 = (struct in6_addr *)__entry->gw6; *in6 = in6addr_any; } else if (nhc->nhc_gw_family == AF_INET6) { p32 = (__be32 *) __entry->gw4; *p32 = 0; in6 = (struct in6_addr *)__entry->gw6; *in6 = nhc->nhc_gw.ipv6; } } else { p32 = (__be32 *) __entry->gw4; *p32 = 0; in6 = (struct in6_addr *)__entry->gw6; *in6 = in6addr_any; } ), TP_printk("table %u oif %d iif %d proto %u %pI4/%u -> %pI4/%u tos %d scope %d flags %x ==> dev %s gw %pI4/%pI6c err %d", __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, __entry->src, __entry->sport, __entry->dst, __entry->dport, __entry->tos, __entry->scope, __entry->flags, __entry->name, __entry->gw4, __entry->gw6, __entry->err) ); #endif /* _TRACE_FIB_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 637 635 636 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | // SPDX-License-Identifier: LGPL-2.0+ /* * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. * This file is part of the GNU C Library. * Contributed by Paul Eggert (eggert@twinsun.com). * * The GNU C Library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * The GNU C Library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with the GNU C Library; see the file COPYING.LIB. If not, * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. 
*/ /* * Converts the calendar time to broken-down time representation * * 2009-7-14: * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com> * 2021-06-02: * Reimplemented by Cassio Neri <cassio.neri@gmail.com> */ #include <linux/time.h> #include <linux/module.h> #include <linux/kernel.h> #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) /** * time64_to_tm - converts the calendar time to local broken-down time * * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970, * Coordinated Universal Time (UTC). * @offset: offset seconds adding to totalsecs. * @result: pointer to struct tm variable to receive broken-down time */ void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) { u32 u32tmp, day_of_century, year_of_century, day_of_year, month, day; u64 u64tmp, udays, century, year; bool is_Jan_or_Feb, is_leap_year; long days, rem; int remainder; days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder); rem = remainder; rem += offset; while (rem < 0) { rem += SECS_PER_DAY; --days; } while (rem >= SECS_PER_DAY) { rem -= SECS_PER_DAY; ++days; } result->tm_hour = rem / SECS_PER_HOUR; rem %= SECS_PER_HOUR; result->tm_min = rem / 60; result->tm_sec = rem % 60; /* January 1, 1970 was a Thursday. */ result->tm_wday = (4 + days) % 7; if (result->tm_wday < 0) result->tm_wday += 7; /* * The following algorithm is, basically, Proposition 6.3 of Neri * and Schneider [1]. In a few words: it works on the computational * (fictitious) calendar where the year starts in March, month = 2 * (*), and finishes in February, month = 13. This calendar is * mathematically convenient because the day of the year does not * depend on whether the year is leap or not. For instance: * * March 1st 0-th day of the year; * ... * April 1st 31-st day of the year; * ... * January 1st 306-th day of the year; (Important!) * ... * February 28th 364-th day of the year; * February 29th 365-th day of the year (if it exists). 
* * After having worked out the date in the computational calendar * (using just arithmetics) it's easy to convert it to the * corresponding date in the Gregorian calendar. * * [1] "Euclidean Affine Functions and Applications to Calendar * Algorithms". https://arxiv.org/abs/2102.06959 * * (*) The numbering of months follows tm more closely and thus, * is slightly different from [1]. */ udays = ((u64) days) + 2305843009213814918ULL; u64tmp = 4 * udays + 3; century = div64_u64_rem(u64tmp, 146097, &u64tmp); day_of_century = (u32) (u64tmp / 4); u32tmp = 4 * day_of_century + 3; u64tmp = 2939745ULL * u32tmp; year_of_century = upper_32_bits(u64tmp); day_of_year = lower_32_bits(u64tmp) / 2939745 / 4; year = 100 * century + year_of_century; is_leap_year = year_of_century ? !(year_of_century % 4) : !(century % 4); u32tmp = 2141 * day_of_year + 132377; month = u32tmp >> 16; day = ((u16) u32tmp) / 2141; /* * Recall that January 1st is the 306-th day of the year in the * computational (not Gregorian) calendar. */ is_Jan_or_Feb = day_of_year >= 306; /* Convert to the Gregorian calendar and adjust to Unix time. */ year = year + is_Jan_or_Feb - 6313183731940000ULL; month = is_Jan_or_Feb ? month - 12 : month; day = day + 1; day_of_year += is_Jan_or_Feb ? -306 : 31 + 28 + is_leap_year; /* Convert to tm's format. */ result->tm_year = (long) (year - 1900); result->tm_mon = (int) month; result->tm_mday = (int) day; result->tm_yday = (int) day_of_year; } EXPORT_SYMBOL(time64_to_tm); |
| 4 4 4 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 
520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 
1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 
1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 
1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 
2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 
2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 
3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 
3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 
3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 
4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 
4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 
5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 
5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 
5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 
6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 | // SPDX-License-Identifier: GPL-1.0+ /* * originally based on the dummy device. * * Copyright 1999, Thomas Davis, tadavis@lbl.gov. * Based on dummy.c, and eql.c devices. 
* * bonding.c: an Ethernet Bonding driver * * This is useful to talk to a Cisco EtherChannel compatible equipment: * Cisco 5500 * Sun Trunking (Solaris) * Alteon AceDirector Trunks * Linux Bonding * and probably many L2 switches ... * * How it works: * ifconfig bond0 ipaddress netmask up * will setup a network device, with an ip address. No mac address * will be assigned at this time. The hw mac address will come from * the first slave bonded to the channel. All slaves will then use * this hw mac address. * * ifconfig bond0 down * will release all slaves, marking them as down. * * ifenslave bond0 eth0 * will attach eth0 to bond0 as a slave. eth0 hw mac address will either * a: be used as initial mac address * b: if a hw mac address already is there, eth0's hw mac address * will then be set from bond0. * */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/filter.h> #include <linux/interrupt.h> #include <linux/ptrace.h> #include <linux/ioport.h> #include <linux/in.h> #include <net/ip.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/init.h> #include <linux/timer.h> #include <linux/socket.h> #include <linux/ctype.h> #include <linux/inet.h> #include <linux/bitops.h> #include <linux/io.h> #include <asm/dma.h> #include <linux/uaccess.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/rtnetlink.h> #include <linux/smp.h> #include <linux/if_ether.h> #include <net/arp.h> #include <linux/mii.h> #include <linux/ethtool.h> #include <linux/if_vlan.h> #include <linux/if_bonding.h> #include <linux/phy.h> #include <linux/jiffies.h> #include <linux/preempt.h> #include <net/route.h> #include <net/net_namespace.h> #include 
<net/netns/generic.h> #include <net/pkt_sched.h> #include <linux/rculist.h> #include <net/flow_dissector.h> #include <net/xfrm.h> #include <net/bonding.h> #include <net/bond_3ad.h> #include <net/bond_alb.h> #if IS_ENABLED(CONFIG_TLS_DEVICE) #include <net/tls.h> #endif #include <net/ip6_route.h> #include <net/xdp.h> #include "bonding_priv.h" /*---------------------------- Module parameters ----------------------------*/ /* monitor all links that often (in milliseconds). <=0 disables monitoring */ static int max_bonds = BOND_DEFAULT_MAX_BONDS; static int tx_queues = BOND_DEFAULT_TX_QUEUES; static int num_peer_notif = 1; static int miimon; static int updelay; static int downdelay; static int use_carrier = 1; static char *mode; static char *primary; static char *primary_reselect; static char *lacp_rate; static int min_links; static char *ad_select; static char *xmit_hash_policy; static int arp_interval; static char *arp_ip_target[BOND_MAX_ARP_TARGETS]; static char *arp_validate; static char *arp_all_targets; static char *fail_over_mac; static int all_slaves_active; static struct bond_params bonding_defaults; static int resend_igmp = BOND_DEFAULT_RESEND_IGMP; static int packets_per_slave = 1; static int lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL; module_param(max_bonds, int, 0); MODULE_PARM_DESC(max_bonds, "Max number of bonded devices"); module_param(tx_queues, int, 0); MODULE_PARM_DESC(tx_queues, "Max number of transmit queues (default = 16)"); module_param_named(num_grat_arp, num_peer_notif, int, 0644); MODULE_PARM_DESC(num_grat_arp, "Number of peer notifications to send on " "failover event (alias of num_unsol_na)"); module_param_named(num_unsol_na, num_peer_notif, int, 0644); MODULE_PARM_DESC(num_unsol_na, "Number of peer notifications to send on " "failover event (alias of num_grat_arp)"); module_param(miimon, int, 0); MODULE_PARM_DESC(miimon, "Link check interval in milliseconds"); module_param(updelay, int, 0); MODULE_PARM_DESC(updelay, "Delay before considering 
link up, in milliseconds"); module_param(downdelay, int, 0); MODULE_PARM_DESC(downdelay, "Delay before considering link down, " "in milliseconds"); module_param(use_carrier, int, 0); MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; " "0 for off, 1 for on (default)"); module_param(mode, charp, 0); MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, " "1 for active-backup, 2 for balance-xor, " "3 for broadcast, 4 for 802.3ad, 5 for balance-tlb, " "6 for balance-alb"); module_param(primary, charp, 0); MODULE_PARM_DESC(primary, "Primary network device to use"); module_param(primary_reselect, charp, 0); MODULE_PARM_DESC(primary_reselect, "Reselect primary slave " "once it comes up; " "0 for always (default), " "1 for only if speed of primary is " "better, " "2 for only on active slave " "failure"); module_param(lacp_rate, charp, 0); MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner; " "0 for slow, 1 for fast"); module_param(ad_select, charp, 0); MODULE_PARM_DESC(ad_select, "802.3ad aggregation selection logic; " "0 for stable (default), 1 for bandwidth, " "2 for count"); module_param(min_links, int, 0); MODULE_PARM_DESC(min_links, "Minimum number of available links before turning on carrier"); module_param(xmit_hash_policy, charp, 0); MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; " "0 for layer 2 (default), 1 for layer 3+4, " "2 for layer 2+3, 3 for encap layer 2+3, " "4 for encap layer 3+4, 5 for vlan+srcmac"); module_param(arp_interval, int, 0); MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds"); module_param_array(arp_ip_target, charp, NULL, 0); MODULE_PARM_DESC(arp_ip_target, "arp targets in n.n.n.n form"); module_param(arp_validate, charp, 0); MODULE_PARM_DESC(arp_validate, "validate src/dst of ARP probes; " "0 for none (default), 1 for active, " "2 for backup, 3 for all"); module_param(arp_all_targets, charp, 0); 
MODULE_PARM_DESC(arp_all_targets, "fail on any/all arp targets timeout; 0 for any (default), 1 for all"); module_param(fail_over_mac, charp, 0); MODULE_PARM_DESC(fail_over_mac, "For active-backup, do not set all slaves to " "the same MAC; 0 for none (default), " "1 for active, 2 for follow"); module_param(all_slaves_active, int, 0); MODULE_PARM_DESC(all_slaves_active, "Keep all frames received on an interface " "by setting active flag for all slaves; " "0 for never (default), 1 for always."); module_param(resend_igmp, int, 0); MODULE_PARM_DESC(resend_igmp, "Number of IGMP membership reports to send on " "link failure"); module_param(packets_per_slave, int, 0); MODULE_PARM_DESC(packets_per_slave, "Packets to send per slave in balance-rr " "mode; 0 for a random slave, 1 packet per " "slave (default), >1 packets per slave."); module_param(lp_interval, uint, 0); MODULE_PARM_DESC(lp_interval, "The number of seconds between instances where " "the bonding driver sends learning packets to " "each slaves peer switch. 
The default is 1."); /*----------------------------- Global variables ----------------------------*/ #ifdef CONFIG_NET_POLL_CONTROLLER atomic_t netpoll_block_tx = ATOMIC_INIT(0); #endif unsigned int bond_net_id __read_mostly; static const struct flow_dissector_key flow_keys_bonding_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_TIPC, .offset = offsetof(struct flow_keys, addrs.tipckey), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, { .key_id = FLOW_DISSECTOR_KEY_ICMP, .offset = offsetof(struct flow_keys, icmp), }, { .key_id = FLOW_DISSECTOR_KEY_VLAN, .offset = offsetof(struct flow_keys, vlan), }, { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, .offset = offsetof(struct flow_keys, tags), }, { .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, .offset = offsetof(struct flow_keys, keyid), }, }; static struct flow_dissector flow_keys_bonding __read_mostly; /*-------------------------- Forward declarations ---------------------------*/ static int bond_init(struct net_device *bond_dev); static void bond_uninit(struct net_device *bond_dev); static void bond_get_stats(struct net_device *bond_dev, struct rtnl_link_stats64 *stats); static void bond_slave_arr_handler(struct work_struct *work); static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act, int mod); static void bond_netdev_notify_work(struct work_struct *work); /*---------------------------- General routines -----------------------------*/ const char *bond_mode_name(int mode) { static const char *names[] = { [BOND_MODE_ROUNDROBIN] = "load balancing (round-robin)", [BOND_MODE_ACTIVEBACKUP] = 
"fault-tolerance (active-backup)", [BOND_MODE_XOR] = "load balancing (xor)", [BOND_MODE_BROADCAST] = "fault-tolerance (broadcast)", [BOND_MODE_8023AD] = "IEEE 802.3ad Dynamic link aggregation", [BOND_MODE_TLB] = "transmit load balancing", [BOND_MODE_ALB] = "adaptive load balancing", }; if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_ALB) return "unknown"; return names[mode]; } /** * bond_dev_queue_xmit - Prepare skb for xmit. * * @bond: bond device that got this skb for tx. * @skb: hw accel VLAN tagged skb to transmit * @slave_dev: slave that is supposed to xmit this skbuff */ netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev) { skb->dev = slave_dev; BUILD_BUG_ON(sizeof(skb->queue_mapping) != sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); if (unlikely(netpoll_tx_running(bond->dev))) return bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb); return dev_queue_xmit(skb); } static bool bond_sk_check(struct bonding *bond) { switch (BOND_MODE(bond)) { case BOND_MODE_8023AD: case BOND_MODE_XOR: if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34) return true; fallthrough; default: return false; } } static bool bond_xdp_check(struct bonding *bond) { switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: case BOND_MODE_ACTIVEBACKUP: return true; case BOND_MODE_8023AD: case BOND_MODE_XOR: /* vlan+srcmac is not supported with XDP as in most cases the 802.1q * payload is not in the packet due to hardware offload. */ if (bond->params.xmit_policy != BOND_XMIT_POLICY_VLAN_SRCMAC) return true; fallthrough; default: return false; } } /*---------------------------------- VLAN -----------------------------------*/ /* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid, * We don't protect the slave list iteration with a lock because: * a. This operation is performed in IOCTL context, * b. 
The operation is protected by the RTNL semaphore in the 8021q code,
 * c. Holding a lock with BH disabled while directly calling a base driver
 *    entry point is generally a BAD idea.
 *
 * The design of synchronization/protection for this operation in the 8021q
 * module is good for one or more VLAN devices over a single physical device
 * and cannot be extended for a teaming solution like bonding, so there is a
 * potential race condition here where a net device from the vlan group might
 * be referenced (either by a base driver or the 8021q code) while it is being
 * removed from the system. However, it turns out we're not making matters
 * worse, and if it works for regular VLAN usage it will work here too.
*/

/**
 * bond_vlan_rx_add_vid - Propagates adding an id to slaves
 * @bond_dev: bonding net device that got called
 * @proto: network protocol ID
 * @vid: vlan id being added
 *
 * Calls vlan_vid_add() on every slave of the bond.  If one slave fails, the
 * vid is removed again from the slaves that were visited before it.
 *
 * Return: 0 on success, otherwise the non-zero value returned by
 * vlan_vid_add() for the slave that failed.
 */
static int bond_vlan_rx_add_vid(struct net_device *bond_dev,
				__be16 proto, u16 vid)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave, *rollback_slave;
	struct list_head *iter;
	int res;

	bond_for_each_slave(bond, slave, iter) {
		res = vlan_vid_add(slave->dev, proto, vid);
		if (res)
			goto unwind;
	}

	return 0;

unwind:
	/* unwind to the slave that failed */
	bond_for_each_slave(bond, rollback_slave, iter) {
		/* 'slave' still points at the entry whose add failed */
		if (rollback_slave == slave)
			break;

		vlan_vid_del(rollback_slave->dev, proto, vid);
	}

	return res;
}

/**
 * bond_vlan_rx_kill_vid - Propagates deleting an id to slaves
 * @bond_dev: bonding net device that got called
 * @proto: network protocol ID
 * @vid: vlan id being removed
 *
 * Calls vlan_vid_del() on every slave and, when bond_is_lb() is true, also
 * bond_alb_clear_vlan() for @vid.
 *
 * Return: always 0.
 */
static int bond_vlan_rx_kill_vid(struct net_device *bond_dev,
				 __be16 proto, u16 vid)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct list_head *iter;
	struct slave *slave;

	bond_for_each_slave(bond, slave, iter)
		vlan_vid_del(slave->dev, proto, vid);

	if (bond_is_lb(bond))
		bond_alb_clear_vlan(bond, vid);

	return 0;
}

/*---------------------------------- XFRM -----------------------------------*/

#ifdef CONFIG_XFRM_OFFLOAD
/**
 * bond_ipsec_add_sa - program device with a security association
 * @xs: pointer to transformer state struct
 * @extack: extack point to fill failure reason
 *
 * Forwards the request to xdo_dev_state_add() of the current active slave's
 * device and, on success, records @xs in bond->ipsec_list.
 *
 * Return: 0 on success; -EINVAL if @xs has no bond device or the slave
 * device cannot offload; -ENODEV if there is no active slave; -ENOMEM if
 * the list entry cannot be allocated; otherwise the slave driver's error.
 **/
static int bond_ipsec_add_sa(struct xfrm_state *xs,
			     struct netlink_ext_ack *extack)
{
	struct net_device *bond_dev = xs->xso.dev;
	struct net_device *real_dev;
	netdevice_tracker tracker;
	struct bond_ipsec *ipsec;
	struct bonding *bond;
	struct slave *slave;
	int err;

	if (!bond_dev)
		return -EINVAL;

	/* Look up the active slave under RCU and take a tracked hold on its
	 * device before leaving the read-side section; the hold is dropped
	 * at the 'out' label on every path.
	 */
	rcu_read_lock();
	bond = netdev_priv(bond_dev);
	slave = rcu_dereference(bond->curr_active_slave);
	real_dev = slave ? slave->dev : NULL;
	netdev_hold(real_dev, &tracker, GFP_ATOMIC);
	rcu_read_unlock();
	if (!real_dev) {
		err = -ENODEV;
		goto out;
	}

	if (!real_dev->xfrmdev_ops ||
	    !real_dev->xfrmdev_ops->xdo_dev_state_add ||
	    netif_is_bond_master(real_dev)) {
		NL_SET_ERR_MSG_MOD(extack, "Slave does not support ipsec offload");
		err = -EINVAL;
		goto out;
	}

	ipsec = kmalloc(sizeof(*ipsec), GFP_KERNEL);
	if (!ipsec) {
		err = -ENOMEM;
		goto out;
	}

	xs->xso.real_dev = real_dev;
	err = real_dev->xfrmdev_ops->xdo_dev_state_add(xs, extack);
	if (!err) {
		/* remember the state so it can be replayed on another slave */
		ipsec->xs = xs;
		INIT_LIST_HEAD(&ipsec->list);
		mutex_lock(&bond->ipsec_lock);
		list_add(&ipsec->list, &bond->ipsec_list);
		mutex_unlock(&bond->ipsec_lock);
	} else {
		kfree(ipsec);
	}
out:
	netdev_put(real_dev, &tracker);
	return err;
}

/* Re-program every SA recorded in bond->ipsec_list on the device of the
 * current active slave.  Entries already bound to that device are skipped;
 * an entry whose add fails gets its xso.real_dev reset to NULL.
 * NOTE(review): uses rtnl_dereference(), so presumably requires RTNL to be
 * held by the caller - confirm.
 */
static void bond_ipsec_add_sa_all(struct bonding *bond)
{
	struct net_device *bond_dev = bond->dev;
	struct net_device *real_dev;
	struct bond_ipsec *ipsec;
	struct slave *slave;

	slave = rtnl_dereference(bond->curr_active_slave);
	real_dev = slave ? slave->dev : NULL;
	if (!real_dev)
		return;

	mutex_lock(&bond->ipsec_lock);
	if (!real_dev->xfrmdev_ops ||
	    !real_dev->xfrmdev_ops->xdo_dev_state_add ||
	    netif_is_bond_master(real_dev)) {
		/* only warn when there actually are states to move over */
		if (!list_empty(&bond->ipsec_list))
			slave_warn(bond_dev, real_dev,
				   "%s: no slave xdo_dev_state_add\n",
				   __func__);
		goto out;
	}

	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
		/* If new state is added before ipsec_lock acquired */
		if (ipsec->xs->xso.real_dev == real_dev)
			continue;

		ipsec->xs->xso.real_dev = real_dev;
		if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) {
			slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
			ipsec->xs->xso.real_dev = NULL;
		}
	}
out:
	mutex_unlock(&bond->ipsec_lock);
}

/**
 * bond_ipsec_del_sa - clear out this specific SA
 * @xs: pointer to transformer state struct
 *
 * Calls xdo_dev_state_delete() on the current active slave's device when
 * @xs is bound to a real device and the slave supports it.  Whether or not
 * that call was made, the entry for @xs is then removed from
 * bond->ipsec_list (every path past the initial bond_dev check reaches the
 * 'out' label).
 **/
static void bond_ipsec_del_sa(struct xfrm_state *xs)
{
	struct net_device *bond_dev = xs->xso.dev;
	struct net_device *real_dev;
	netdevice_tracker tracker;
	struct bond_ipsec *ipsec;
	struct bonding *bond;
	struct slave *slave;

	if (!bond_dev)
		return;

	rcu_read_lock();
	bond = netdev_priv(bond_dev);
	slave = rcu_dereference(bond->curr_active_slave);
	real_dev = slave ? slave->dev : NULL;
	netdev_hold(real_dev, &tracker, GFP_ATOMIC);
	rcu_read_unlock();

	if (!slave)
		goto out;

	if (!xs->xso.real_dev)
		goto out;

	WARN_ON(xs->xso.real_dev != real_dev);

	if (!real_dev->xfrmdev_ops ||
	    !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
	    netif_is_bond_master(real_dev)) {
		slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__);
		goto out;
	}

	real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
out:
	netdev_put(real_dev, &tracker);
	/* drop the bookkeeping entry that bond_ipsec_add_sa() created */
	mutex_lock(&bond->ipsec_lock);
	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
		if (ipsec->xs == xs) {
			list_del(&ipsec->list);
			kfree(ipsec);
			break;
		}
	}
	mutex_unlock(&bond->ipsec_lock);
}

/* Delete (and, if the driver provides xdo_dev_state_free, free) every SA in
 * bond->ipsec_list on the device of the current active slave.  The list
 * entries themselves are kept.  Entries whose xso.real_dev is NULL are
 * skipped.
 * NOTE(review): uses rtnl_dereference(), so presumably requires RTNL to be
 * held by the caller - confirm.
 */
static void bond_ipsec_del_sa_all(struct bonding *bond)
{
	struct net_device *bond_dev = bond->dev;
	struct net_device *real_dev;
	struct bond_ipsec *ipsec;
	struct slave *slave;

	slave = rtnl_dereference(bond->curr_active_slave);
	real_dev = slave ? slave->dev : NULL;
	if (!real_dev)
		return;

	mutex_lock(&bond->ipsec_lock);
	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
		if (!ipsec->xs->xso.real_dev)
			continue;

		if (!real_dev->xfrmdev_ops ||
		    !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
		    netif_is_bond_master(real_dev)) {
			slave_warn(bond_dev, real_dev,
				   "%s: no slave xdo_dev_state_delete\n",
				   __func__);
		} else {
			real_dev->xfrmdev_ops->xdo_dev_state_delete(ipsec->xs);
			if (real_dev->xfrmdev_ops->xdo_dev_state_free)
				real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
		}
	}
	mutex_unlock(&bond->ipsec_lock);
}

/* Forward an SA free request to xdo_dev_state_free() of the current active
 * slave's device, if @xs is bound to a real device and the driver provides
 * that callback.
 */
static void bond_ipsec_free_sa(struct xfrm_state *xs)
{
	struct net_device *bond_dev = xs->xso.dev;
	struct net_device *real_dev;
	netdevice_tracker tracker;
	struct bonding *bond;
	struct slave *slave;

	if (!bond_dev)
		return;

	rcu_read_lock();
	bond = netdev_priv(bond_dev);
	slave = rcu_dereference(bond->curr_active_slave);
	real_dev = slave ? slave->dev : NULL;
	netdev_hold(real_dev, &tracker, GFP_ATOMIC);
	rcu_read_unlock();

	if (!slave)
		goto out;

	if (!xs->xso.real_dev)
		goto out;

	WARN_ON(xs->xso.real_dev != real_dev);

	if (real_dev && real_dev->xfrmdev_ops &&
	    real_dev->xfrmdev_ops->xdo_dev_state_free)
		real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
out:
	netdev_put(real_dev, &tracker);
}

/**
 * bond_ipsec_offload_ok - can this packet use the xfrm hw offload
 * @skb: current data packet
 * @xs: pointer to transformer state struct
 *
 * Return: false when there is no active slave, the bond is not in
 * active-backup mode, @xs is not bound to a real device, or the slave
 * device lacks xdo_dev_offload_ok; otherwise the slave driver's answer.
 **/
static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
{
	struct net_device *bond_dev = xs->xso.dev;
	struct net_device *real_dev;
	struct slave *curr_active;
	struct bonding *bond;
	bool ok = false;

	bond = netdev_priv(bond_dev);
	rcu_read_lock();
	curr_active = rcu_dereference(bond->curr_active_slave);
	if (!curr_active)
		goto out;
	real_dev = curr_active->dev;

	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
		goto out;

	if (!xs->xso.real_dev)
		goto out;

	if (!real_dev->xfrmdev_ops ||
	    !real_dev->xfrmdev_ops->xdo_dev_offload_ok ||
	    netif_is_bond_master(real_dev))
		goto out;

	/* the driver callback runs inside the RCU read-side section */
	ok = real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
out:
	rcu_read_unlock();
	return ok;
}

/* xfrm offload callbacks of the bond device; each one forwards to the
 * matching callback of the current active slave's device.
 */
static const struct xfrmdev_ops bond_xfrmdev_ops = {
	.xdo_dev_state_add = bond_ipsec_add_sa,
	.xdo_dev_state_delete = bond_ipsec_del_sa,
	.xdo_dev_state_free = bond_ipsec_free_sa,
	.xdo_dev_offload_ok = bond_ipsec_offload_ok,
};
#endif /* CONFIG_XFRM_OFFLOAD */

/*------------------------------- Link status -------------------------------*/

/* Set the carrier state for the master according to the state of its
 * slaves.  If any slaves are up, the master is up.  In 802.3ad mode,
 * do special 802.3ad magic.
 *
 * Returns zero if carrier state does not change, nonzero if it does.
*/ int bond_set_carrier(struct bonding *bond) { struct list_head *iter; struct slave *slave; if (!bond_has_slaves(bond)) goto down; if (BOND_MODE(bond) == BOND_MODE_8023AD) return bond_3ad_set_carrier(bond); bond_for_each_slave(bond, slave, iter) { if (slave->link == BOND_LINK_UP) { if (!netif_carrier_ok(bond->dev)) { netif_carrier_on(bond->dev); return 1; } return 0; } } down: if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); return 1; } return 0; } /* Get link speed and duplex from the slave's base driver * using ethtool. If for some reason the call fails or the * values are invalid, set speed and duplex to -1, * and return. Return 1 if speed or duplex settings are * UNKNOWN; 0 otherwise. */ static int bond_update_speed_duplex(struct slave *slave) { struct net_device *slave_dev = slave->dev; struct ethtool_link_ksettings ecmd; int res; slave->speed = SPEED_UNKNOWN; slave->duplex = DUPLEX_UNKNOWN; res = __ethtool_get_link_ksettings(slave_dev, &ecmd); if (res < 0) return 1; if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) return 1; switch (ecmd.base.duplex) { case DUPLEX_FULL: case DUPLEX_HALF: break; default: return 1; } slave->speed = ecmd.base.speed; slave->duplex = ecmd.base.duplex; return 0; } const char *bond_slave_link_status(s8 link) { switch (link) { case BOND_LINK_UP: return "up"; case BOND_LINK_FAIL: return "going down"; case BOND_LINK_DOWN: return "down"; case BOND_LINK_BACK: return "going back"; default: return "unknown"; } } /* if <dev> supports MII link status reporting, check its link status. * * We either do MII/ETHTOOL ioctls, or check netif_carrier_ok(), * depending upon the setting of the use_carrier parameter. * * Return either BMSR_LSTATUS, meaning that the link is up (or we * can't tell and just pretend it is), or 0, meaning that the link is * down. * * If reporting is non-zero, instead of faking link up, return -1 if * both ETHTOOL and MII ioctls fail (meaning the device does not * support them). 
If use_carrier is set, return whatever it says. * It'd be nice if there was a good way to tell if a driver supports * netif_carrier, but there really isn't. */ static int bond_check_dev_link(struct bonding *bond, struct net_device *slave_dev, int reporting) { const struct net_device_ops *slave_ops = slave_dev->netdev_ops; int (*ioctl)(struct net_device *, struct ifreq *, int); struct ifreq ifr; struct mii_ioctl_data *mii; if (!reporting && !netif_running(slave_dev)) return 0; if (bond->params.use_carrier) return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0; /* Try to get link status using Ethtool first. */ if (slave_dev->ethtool_ops->get_link) return slave_dev->ethtool_ops->get_link(slave_dev) ? BMSR_LSTATUS : 0; /* Ethtool can't be used, fallback to MII ioctls. */ ioctl = slave_ops->ndo_eth_ioctl; if (ioctl) { /* TODO: set pointer to correct ioctl on a per team member * bases to make this more efficient. that is, once * we determine the correct ioctl, we will always * call it and not the others for that team * member. */ /* We cannot assume that SIOCGMIIPHY will also read a * register; not all network drivers (e.g., e100) * support that. */ /* Yes, the mii is overlaid on the ifreq.ifr_ifru */ strscpy_pad(ifr.ifr_name, slave_dev->name, IFNAMSIZ); mii = if_mii(&ifr); if (ioctl(slave_dev, &ifr, SIOCGMIIPHY) == 0) { mii->reg_num = MII_BMSR; if (ioctl(slave_dev, &ifr, SIOCGMIIREG) == 0) return mii->val_out & BMSR_LSTATUS; } } /* If reporting, report that either there's no ndo_eth_ioctl, * or both SIOCGMIIREG and get_link failed (meaning that we * cannot report link status). If not reporting, pretend * we're ok. */ return reporting ? 
-1 : BMSR_LSTATUS; } /*----------------------------- Multicast list ------------------------------*/ /* Push the promiscuity flag down to appropriate slaves */ static int bond_set_promiscuity(struct bonding *bond, int inc) { struct list_head *iter; int err = 0; if (bond_uses_primary(bond)) { struct slave *curr_active = rtnl_dereference(bond->curr_active_slave); if (curr_active) err = dev_set_promiscuity(curr_active->dev, inc); } else { struct slave *slave; bond_for_each_slave(bond, slave, iter) { err = dev_set_promiscuity(slave->dev, inc); if (err) return err; } } return err; } /* Push the allmulti flag down to all slaves */ static int bond_set_allmulti(struct bonding *bond, int inc) { struct list_head *iter; int err = 0; if (bond_uses_primary(bond)) { struct slave *curr_active = rtnl_dereference(bond->curr_active_slave); if (curr_active) err = dev_set_allmulti(curr_active->dev, inc); } else { struct slave *slave; bond_for_each_slave(bond, slave, iter) { err = dev_set_allmulti(slave->dev, inc); if (err) return err; } } return err; } /* Retrieve the list of registered multicast addresses for the bonding * device and retransmit an IGMP JOIN request to the current active * slave. 
*/
static void bond_resend_igmp_join_requests_delayed(struct work_struct *work)
{
	struct bonding *bond = container_of(work, struct bonding,
					    mcast_work.work);

	/* RTNL is only tried, never waited for: on contention the work is
	 * re-queued with a delay of 1 and retried later.
	 */
	if (!rtnl_trylock()) {
		queue_delayed_work(bond->wq, &bond->mcast_work, 1);
		return;
	}
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, bond->dev);

	/* igmp_retrans counts the remaining resends; re-arm while > 1 */
	if (bond->igmp_retrans > 1) {
		bond->igmp_retrans--;
		queue_delayed_work(bond->wq, &bond->mcast_work, HZ/5);
	}
	rtnl_unlock();
}

/* Flush bond's hardware addresses from slave */
static void bond_hw_addr_flush(struct net_device *bond_dev,
			       struct net_device *slave_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);

	dev_uc_unsync(slave_dev, bond_dev);
	dev_mc_unsync(slave_dev, bond_dev);

	/* in 802.3ad mode also drop the LACPDU multicast address */
	if (BOND_MODE(bond) == BOND_MODE_8023AD)
		dev_mc_del(slave_dev, lacpdu_mcast_addr);
}

/*--------------------------- Active slave change ---------------------------*/

/* Update the hardware address list and promisc/allmulti for the new and
 * old active slaves (if any).  Modes that are not using primary keep all
 * slaves up to date at all times; only the modes that use primary need to
 * call this function to swap these settings during a failover.
 */
static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active,
			      struct slave *old_active)
{
	/* first take the bond's settings away from the old active slave */
	if (old_active) {
		if (bond->dev->flags & IFF_PROMISC)
			dev_set_promiscuity(old_active->dev, -1);

		if (bond->dev->flags & IFF_ALLMULTI)
			dev_set_allmulti(old_active->dev, -1);

		if (bond->dev->flags & IFF_UP)
			bond_hw_addr_flush(bond->dev, old_active->dev);
	}

	/* then apply them to the new active slave */
	if (new_active) {
		/* FIXME: Signal errors upstream. */
		if (bond->dev->flags & IFF_PROMISC)
			dev_set_promiscuity(new_active->dev, 1);

		if (bond->dev->flags & IFF_ALLMULTI)
			dev_set_allmulti(new_active->dev, 1);

		if (bond->dev->flags & IFF_UP) {
			netif_addr_lock_bh(bond->dev);
			dev_uc_sync(new_active->dev, bond->dev);
			dev_mc_sync(new_active->dev, bond->dev);
			netif_addr_unlock_bh(bond->dev);
		}
	}
}

/**
 * bond_set_dev_addr - clone slave's address to bond
 * @bond_dev: bond net device
 * @slave_dev: slave net device
 *
 * Should be called with RTNL held.
 *
 * Return: 0 on success, or the error from dev_pre_changeaddr_notify(), in
 * which case the bond's address is left unchanged.
 */
static int bond_set_dev_addr(struct net_device *bond_dev,
			     struct net_device *slave_dev)
{
	int err;

	slave_dbg(bond_dev, slave_dev, "bond_dev=%p slave_dev=%p slave_dev->addr_len=%d\n",
		  bond_dev, slave_dev, slave_dev->addr_len);
	err = dev_pre_changeaddr_notify(bond_dev, slave_dev->dev_addr, NULL);
	if (err)
		return err;

	__dev_addr_set(bond_dev, slave_dev->dev_addr, slave_dev->addr_len);
	bond_dev->addr_assign_type = NET_ADDR_STOLEN;
	call_netdevice_notifiers(NETDEV_CHANGEADDR, bond_dev);
	return 0;
}

/* Find a slave other than @new_active whose device address equals the
 * bond's current address.  Returns NULL if there is none.
 */
static struct slave *bond_get_old_active(struct bonding *bond,
					 struct slave *new_active)
{
	struct slave *slave;
	struct list_head *iter;

	bond_for_each_slave(bond, slave, iter) {
		if (slave == new_active)
			continue;

		if (ether_addr_equal(bond->dev->dev_addr, slave->dev->dev_addr))
			return slave;
	}

	return NULL;
}

/* bond_do_fail_over_mac
 *
 * Perform special MAC address swapping for fail_over_mac settings
 *
 * BOND_FOM_ACTIVE: the bond takes over the address of @new_active.
 * BOND_FOM_FOLLOW: @new_active gets the address of @old_active (or of the
 * bond when no old active slave can be determined), and @old_active gets
 * the address @new_active had before.
 * Failures are only logged; nothing is returned to the caller.
 *
 * Called with RTNL
 */
static void bond_do_fail_over_mac(struct bonding *bond,
				  struct slave *new_active,
				  struct slave *old_active)
{
	u8 tmp_mac[MAX_ADDR_LEN];
	struct sockaddr_storage ss;
	int rv;

	switch (bond->params.fail_over_mac) {
	case BOND_FOM_ACTIVE:
		if (new_active) {
			rv = bond_set_dev_addr(bond->dev, new_active->dev);
			if (rv)
				slave_err(bond->dev, new_active->dev, "Error %d setting bond MAC from slave\n",
					  -rv);
		}
		break;
	case BOND_FOM_FOLLOW:
		/* if new_active && old_active, swap them
		 * if just old_active, do nothing (going to no active slave)
		 * if just new_active, set new_active to bond's MAC
		 */
		if (!new_active)
			return;

		/* no old active given: look for the slave that currently
		 * carries the bond's address
		 */
		if (!old_active)
			old_active = bond_get_old_active(bond, new_active);

		if (old_active) {
			/* tmp_mac keeps new_active's address for the second
			 * half of the swap below
			 */
			bond_hw_addr_copy(tmp_mac, new_active->dev->dev_addr,
					  new_active->dev->addr_len);
			bond_hw_addr_copy(ss.__data,
					  old_active->dev->dev_addr,
					  old_active->dev->addr_len);
			ss.ss_family = new_active->dev->type;
		} else {
			bond_hw_addr_copy(ss.__data, bond->dev->dev_addr,
					  bond->dev->addr_len);
			ss.ss_family = bond->dev->type;
		}

		rv = dev_set_mac_address(new_active->dev,
					 (struct sockaddr *)&ss, NULL);
		if (rv) {
			slave_err(bond->dev, new_active->dev, "Error %d setting MAC of new active slave\n",
				  -rv);
			goto out;
		}

		if (!old_active)
			goto out;

		/* second half of the swap: tmp_mac is only valid here
		 * because old_active is non-NULL
		 */
		bond_hw_addr_copy(ss.__data, tmp_mac,
				  new_active->dev->addr_len);
		ss.ss_family = old_active->dev->type;

		rv = dev_set_mac_address(old_active->dev,
					 (struct sockaddr *)&ss, NULL);
		if (rv)
			slave_err(bond->dev, old_active->dev, "Error %d setting MAC of old active slave\n",
				  -rv);
out:
		break;
	default:
		netdev_err(bond->dev, "bond_do_fail_over_mac impossible: bad policy %d\n",
			   bond->params.fail_over_mac);
		break;
	}
}

/**
 * bond_choose_primary_or_current - select the primary or high priority slave
 * @bond: our bonding struct
 *
 * - Check if there is a primary link. If the primary link was set and is up,
 *   go on and do link reselection.
 *
 * - If primary link is not set or down, find the highest priority link.
 *   If the highest priority link is not current slave, set it as primary
 *   link and do link reselection.
*/ static struct slave *bond_choose_primary_or_current(struct bonding *bond) { struct slave *prim = rtnl_dereference(bond->primary_slave); struct slave *curr = rtnl_dereference(bond->curr_active_slave); struct slave *slave, *hprio = NULL; struct list_head *iter; if (!prim || prim->link != BOND_LINK_UP) { bond_for_each_slave(bond, slave, iter) { if (slave->link == BOND_LINK_UP) { hprio = hprio ?: slave; if (slave->prio > hprio->prio) hprio = slave; } } if (hprio && hprio != curr) { prim = hprio; goto link_reselect; } if (!curr || curr->link != BOND_LINK_UP) return NULL; return curr; } if (bond->force_primary) { bond->force_primary = false; return prim; } link_reselect: if (!curr || curr->link != BOND_LINK_UP) return prim; /* At this point, prim and curr are both up */ switch (bond->params.primary_reselect) { case BOND_PRI_RESELECT_ALWAYS: return prim; case BOND_PRI_RESELECT_BETTER: if (prim->speed < curr->speed) return curr; if (prim->speed == curr->speed && prim->duplex <= curr->duplex) return curr; return prim; case BOND_PRI_RESELECT_FAILURE: return curr; default: netdev_err(bond->dev, "impossible primary_reselect %d\n", bond->params.primary_reselect); return curr; } } /** * bond_find_best_slave - select the best available slave to be the active one * @bond: our bonding struct */ static struct slave *bond_find_best_slave(struct bonding *bond) { struct slave *slave, *bestslave = NULL; struct list_head *iter; int mintime = bond->params.updelay; slave = bond_choose_primary_or_current(bond); if (slave) return slave; bond_for_each_slave(bond, slave, iter) { if (slave->link == BOND_LINK_UP) return slave; if (slave->link == BOND_LINK_BACK && bond_slave_is_up(slave) && slave->delay < mintime) { mintime = slave->delay; bestslave = slave; } } return bestslave; } /* must be called in RCU critical section or with RTNL held */ static bool bond_should_notify_peers(struct bonding *bond) { struct slave *slave = rcu_dereference_rtnl(bond->curr_active_slave); if (!slave || 
!bond->send_peer_notif || bond->send_peer_notif % max(1, bond->params.peer_notif_delay) != 0 || !netif_carrier_ok(bond->dev) || test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state)) return false; netdev_dbg(bond->dev, "bond_should_notify_peers: slave %s\n", slave ? slave->dev->name : "NULL"); return true; } /** * bond_change_active_slave - change the active slave into the specified one * @bond: our bonding struct * @new_active: the new slave to make the active one * * Set the new slave to the bond's settings and unset them on the old * curr_active_slave. * Setting include flags, mc-list, promiscuity, allmulti, etc. * * If @new's link state is %BOND_LINK_BACK we'll set it to %BOND_LINK_UP, * because it is apparently the best available slave we have, even though its * updelay hasn't timed out yet. * * Caller must hold RTNL. */ void bond_change_active_slave(struct bonding *bond, struct slave *new_active) { struct slave *old_active; ASSERT_RTNL(); old_active = rtnl_dereference(bond->curr_active_slave); if (old_active == new_active) return; #ifdef CONFIG_XFRM_OFFLOAD bond_ipsec_del_sa_all(bond); #endif /* CONFIG_XFRM_OFFLOAD */ if (new_active) { new_active->last_link_up = jiffies; if (new_active->link == BOND_LINK_BACK) { if (bond_uses_primary(bond)) { slave_info(bond->dev, new_active->dev, "making interface the new active one %d ms earlier\n", (bond->params.updelay - new_active->delay) * bond->params.miimon); } new_active->delay = 0; bond_set_slave_link_state(new_active, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_handle_link_change(new_active, BOND_LINK_UP); if (bond_is_lb(bond)) bond_alb_handle_link_change(bond, new_active, BOND_LINK_UP); } else { if (bond_uses_primary(bond)) slave_info(bond->dev, new_active->dev, "making interface the new active one\n"); } } if (bond_uses_primary(bond)) bond_hw_addr_swap(bond, new_active, old_active); if (bond_is_lb(bond)) { bond_alb_handle_active_change(bond, new_active); if 
(old_active) bond_set_slave_inactive_flags(old_active, BOND_SLAVE_NOTIFY_NOW); if (new_active) bond_set_slave_active_flags(new_active, BOND_SLAVE_NOTIFY_NOW); } else { rcu_assign_pointer(bond->curr_active_slave, new_active); } if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) { if (old_active) bond_set_slave_inactive_flags(old_active, BOND_SLAVE_NOTIFY_NOW); if (new_active) { bool should_notify_peers = false; bond_set_slave_active_flags(new_active, BOND_SLAVE_NOTIFY_NOW); if (bond->params.fail_over_mac) bond_do_fail_over_mac(bond, new_active, old_active); if (netif_running(bond->dev)) { bond->send_peer_notif = bond->params.num_peer_notif * max(1, bond->params.peer_notif_delay); should_notify_peers = bond_should_notify_peers(bond); } call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, bond->dev); if (should_notify_peers) { bond->send_peer_notif--; call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); } } } #ifdef CONFIG_XFRM_OFFLOAD bond_ipsec_add_sa_all(bond); #endif /* CONFIG_XFRM_OFFLOAD */ /* resend IGMP joins since active slave has changed or * all were sent on curr_active_slave. * resend only if bond is brought up with the affected * bonding modes and the retransmission is enabled */ if (netif_running(bond->dev) && (bond->params.resend_igmp > 0) && ((bond_uses_primary(bond) && new_active) || BOND_MODE(bond) == BOND_MODE_ROUNDROBIN)) { bond->igmp_retrans = bond->params.resend_igmp; queue_delayed_work(bond->wq, &bond->mcast_work, 1); } } /** * bond_select_active_slave - select a new active slave, if needed * @bond: our bonding struct * * This functions should be called when one of the following occurs: * - The old curr_active_slave has been released or lost its link. * - The primary_slave has got its link back. * - A slave has got its link back and there's no old curr_active_slave. * * Caller must hold RTNL. 
*/ void bond_select_active_slave(struct bonding *bond) { struct slave *best_slave; int rv; ASSERT_RTNL(); best_slave = bond_find_best_slave(bond); if (best_slave != rtnl_dereference(bond->curr_active_slave)) { bond_change_active_slave(bond, best_slave); rv = bond_set_carrier(bond); if (!rv) return; if (netif_carrier_ok(bond->dev)) netdev_info(bond->dev, "active interface up!\n"); else netdev_info(bond->dev, "now running without any active interface!\n"); } } #ifdef CONFIG_NET_POLL_CONTROLLER static inline int slave_enable_netpoll(struct slave *slave) { struct netpoll *np; int err = 0; np = kzalloc(sizeof(*np), GFP_KERNEL); err = -ENOMEM; if (!np) goto out; err = __netpoll_setup(np, slave->dev); if (err) { kfree(np); goto out; } slave->np = np; out: return err; } static inline void slave_disable_netpoll(struct slave *slave) { struct netpoll *np = slave->np; if (!np) return; slave->np = NULL; __netpoll_free(np); } static void bond_poll_controller(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave = NULL; struct list_head *iter; struct ad_info ad_info; if (BOND_MODE(bond) == BOND_MODE_8023AD) if (bond_3ad_get_active_agg_info(bond, &ad_info)) return; bond_for_each_slave_rcu(bond, slave, iter) { if (!bond_slave_is_up(slave)) continue; if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct aggregator *agg = SLAVE_AD_INFO(slave)->port.aggregator; if (agg && agg->aggregator_identifier != ad_info.aggregator_id) continue; } netpoll_poll_dev(slave->dev); } } static void bond_netpoll_cleanup(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; bond_for_each_slave(bond, slave, iter) if (bond_slave_is_up(slave)) slave_disable_netpoll(slave); } static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; struct slave *slave; int err = 0; bond_for_each_slave(bond, slave, iter) { err = 
slave_enable_netpoll(slave); if (err) { bond_netpoll_cleanup(dev); break; } } return err; } #else static inline int slave_enable_netpoll(struct slave *slave) { return 0; } static inline void slave_disable_netpoll(struct slave *slave) { } static void bond_netpoll_cleanup(struct net_device *bond_dev) { } #endif /*---------------------------------- IOCTL ----------------------------------*/ static netdev_features_t bond_fix_features(struct net_device *dev, netdev_features_t features) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; netdev_features_t mask; struct slave *slave; mask = features; features &= ~NETIF_F_ONE_FOR_ALL; features |= NETIF_F_ALL_FOR_ALL; bond_for_each_slave(bond, slave, iter) { features = netdev_increment_features(features, slave->dev->features, mask); } features = netdev_add_tso_features(features, mask); return features; } #define BOND_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | \ NETIF_F_HIGHDMA | NETIF_F_LRO) #define BOND_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE) #define BOND_MPLS_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_GSO_SOFTWARE) static void bond_compute_features(struct bonding *bond) { unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; netdev_features_t vlan_features = BOND_VLAN_FEATURES; netdev_features_t enc_features = BOND_ENC_FEATURES; #ifdef CONFIG_XFRM_OFFLOAD netdev_features_t xfrm_features = BOND_XFRM_FEATURES; #endif /* CONFIG_XFRM_OFFLOAD */ netdev_features_t mpls_features = BOND_MPLS_FEATURES; struct net_device *bond_dev = bond->dev; struct list_head *iter; struct slave *slave; unsigned short max_hard_header_len = ETH_HLEN; unsigned int tso_max_size = TSO_MAX_SIZE; u16 tso_max_segs = TSO_MAX_SEGS; if (!bond_has_slaves(bond)) goto done; vlan_features &= NETIF_F_ALL_FOR_ALL; mpls_features &= NETIF_F_ALL_FOR_ALL; bond_for_each_slave(bond, slave, iter) { vlan_features = 
netdev_increment_features(vlan_features, slave->dev->vlan_features, BOND_VLAN_FEATURES); enc_features = netdev_increment_features(enc_features, slave->dev->hw_enc_features, BOND_ENC_FEATURES); #ifdef CONFIG_XFRM_OFFLOAD xfrm_features = netdev_increment_features(xfrm_features, slave->dev->hw_enc_features, BOND_XFRM_FEATURES); #endif /* CONFIG_XFRM_OFFLOAD */ mpls_features = netdev_increment_features(mpls_features, slave->dev->mpls_features, BOND_MPLS_FEATURES); dst_release_flag &= slave->dev->priv_flags; if (slave->dev->hard_header_len > max_hard_header_len) max_hard_header_len = slave->dev->hard_header_len; tso_max_size = min(tso_max_size, slave->dev->tso_max_size); tso_max_segs = min(tso_max_segs, slave->dev->tso_max_segs); } bond_dev->hard_header_len = max_hard_header_len; done: bond_dev->vlan_features = vlan_features; bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; #ifdef CONFIG_XFRM_OFFLOAD bond_dev->hw_enc_features |= xfrm_features; #endif /* CONFIG_XFRM_OFFLOAD */ bond_dev->mpls_features = mpls_features; netif_set_tso_max_segs(bond_dev, tso_max_segs); netif_set_tso_max_size(bond_dev, tso_max_size); bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE; netdev_change_features(bond_dev); } static void bond_setup_by_slave(struct net_device *bond_dev, struct net_device *slave_dev) { bool was_up = !!(bond_dev->flags & IFF_UP); dev_close(bond_dev); bond_dev->header_ops = slave_dev->header_ops; bond_dev->type = slave_dev->type; bond_dev->hard_header_len = slave_dev->hard_header_len; bond_dev->needed_headroom = slave_dev->needed_headroom; bond_dev->addr_len = slave_dev->addr_len; memcpy(bond_dev->broadcast, slave_dev->broadcast, slave_dev->addr_len); if (slave_dev->flags & IFF_POINTOPOINT) { bond_dev->flags &= ~(IFF_BROADCAST | 
IFF_MULTICAST); bond_dev->flags |= (IFF_POINTOPOINT | IFF_NOARP); } if (was_up) dev_open(bond_dev, NULL); } /* On bonding slaves other than the currently active slave, suppress * duplicates except for alb non-mcast/bcast. */ static bool bond_should_deliver_exact_match(struct sk_buff *skb, struct slave *slave, struct bonding *bond) { if (bond_is_slave_inactive(slave)) { if (BOND_MODE(bond) == BOND_MODE_ALB && skb->pkt_type != PACKET_BROADCAST && skb->pkt_type != PACKET_MULTICAST) return false; return true; } return false; } static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct slave *slave; struct bonding *bond; int (*recv_probe)(const struct sk_buff *, struct bonding *, struct slave *); int ret = RX_HANDLER_ANOTHER; skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return RX_HANDLER_CONSUMED; *pskb = skb; slave = bond_slave_get_rcu(skb->dev); bond = slave->bond; recv_probe = READ_ONCE(bond->recv_probe); if (recv_probe) { ret = recv_probe(skb, bond, slave); if (ret == RX_HANDLER_CONSUMED) { consume_skb(skb); return ret; } } /* * For packets determined by bond_should_deliver_exact_match() call to * be suppressed we want to make an exception for link-local packets. * This is necessary for e.g. LLDP daemons to be able to monitor * inactive slave links without being forced to bind to them * explicitly. * * At the same time, packets that are passed to the bonding master * (including link-local ones) can have their originating interface * determined via PACKET_ORIGDEV socket option. 
*/ if (bond_should_deliver_exact_match(skb, slave, bond)) { if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) return RX_HANDLER_PASS; return RX_HANDLER_EXACT; } skb->dev = bond->dev; if (BOND_MODE(bond) == BOND_MODE_ALB && netif_is_bridge_port(bond->dev) && skb->pkt_type == PACKET_HOST) { if (unlikely(skb_cow_head(skb, skb->data - skb_mac_header(skb)))) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } bond_hw_addr_copy(eth_hdr(skb)->h_dest, bond->dev->dev_addr, bond->dev->addr_len); } return ret; } static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond) { switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: return NETDEV_LAG_TX_TYPE_ROUNDROBIN; case BOND_MODE_ACTIVEBACKUP: return NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; case BOND_MODE_BROADCAST: return NETDEV_LAG_TX_TYPE_BROADCAST; case BOND_MODE_XOR: case BOND_MODE_8023AD: return NETDEV_LAG_TX_TYPE_HASH; default: return NETDEV_LAG_TX_TYPE_UNKNOWN; } } static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond, enum netdev_lag_tx_type type) { if (type != NETDEV_LAG_TX_TYPE_HASH) return NETDEV_LAG_HASH_NONE; switch (bond->params.xmit_policy) { case BOND_XMIT_POLICY_LAYER2: return NETDEV_LAG_HASH_L2; case BOND_XMIT_POLICY_LAYER34: return NETDEV_LAG_HASH_L34; case BOND_XMIT_POLICY_LAYER23: return NETDEV_LAG_HASH_L23; case BOND_XMIT_POLICY_ENCAP23: return NETDEV_LAG_HASH_E23; case BOND_XMIT_POLICY_ENCAP34: return NETDEV_LAG_HASH_E34; case BOND_XMIT_POLICY_VLAN_SRCMAC: return NETDEV_LAG_HASH_VLAN_SRCMAC; default: return NETDEV_LAG_HASH_UNKNOWN; } } static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave, struct netlink_ext_ack *extack) { struct netdev_lag_upper_info lag_upper_info; enum netdev_lag_tx_type type; int err; type = bond_lag_tx_type(bond); lag_upper_info.tx_type = type; lag_upper_info.hash_type = bond_lag_hash_type(bond, type); err = netdev_master_upper_dev_link(slave->dev, bond->dev, slave, &lag_upper_info, extack); if (err) return err; slave->dev->flags |= IFF_SLAVE; 
return 0; } static void bond_upper_dev_unlink(struct bonding *bond, struct slave *slave) { netdev_upper_dev_unlink(slave->dev, bond->dev); slave->dev->flags &= ~IFF_SLAVE; } static void slave_kobj_release(struct kobject *kobj) { struct slave *slave = to_slave(kobj); struct bonding *bond = bond_get_bond_by_slave(slave); cancel_delayed_work_sync(&slave->notify_work); if (BOND_MODE(bond) == BOND_MODE_8023AD) kfree(SLAVE_AD_INFO(slave)); kfree(slave); } static struct kobj_type slave_ktype = { .release = slave_kobj_release, #ifdef CONFIG_SYSFS .sysfs_ops = &slave_sysfs_ops, #endif }; static int bond_kobj_init(struct slave *slave) { int err; err = kobject_init_and_add(&slave->kobj, &slave_ktype, &(slave->dev->dev.kobj), "bonding_slave"); if (err) kobject_put(&slave->kobj); return err; } static struct slave *bond_alloc_slave(struct bonding *bond, struct net_device *slave_dev) { struct slave *slave = NULL; slave = kzalloc(sizeof(*slave), GFP_KERNEL); if (!slave) return NULL; slave->bond = bond; slave->dev = slave_dev; INIT_DELAYED_WORK(&slave->notify_work, bond_netdev_notify_work); if (bond_kobj_init(slave)) return NULL; if (BOND_MODE(bond) == BOND_MODE_8023AD) { SLAVE_AD_INFO(slave) = kzalloc(sizeof(struct ad_slave_info), GFP_KERNEL); if (!SLAVE_AD_INFO(slave)) { kobject_put(&slave->kobj); return NULL; } } return slave; } static void bond_fill_ifbond(struct bonding *bond, struct ifbond *info) { info->bond_mode = BOND_MODE(bond); info->miimon = bond->params.miimon; info->num_slaves = bond->slave_cnt; } static void bond_fill_ifslave(struct slave *slave, struct ifslave *info) { strcpy(info->slave_name, slave->dev->name); info->link = slave->link; info->state = bond_slave_state(slave); info->link_failure_count = slave->link_failure_count; } static void bond_netdev_notify_work(struct work_struct *_work) { struct slave *slave = container_of(_work, struct slave, notify_work.work); if (rtnl_trylock()) { struct netdev_bonding_info binfo; bond_fill_ifslave(slave, &binfo.slave); 
bond_fill_ifbond(slave->bond, &binfo.master); netdev_bonding_info_change(slave->dev, &binfo); rtnl_unlock(); } else { queue_delayed_work(slave->bond->wq, &slave->notify_work, 1); } } void bond_queue_slave_event(struct slave *slave) { queue_delayed_work(slave->bond->wq, &slave->notify_work, 0); } void bond_lower_state_changed(struct slave *slave) { struct netdev_lag_lower_state_info info; info.link_up = slave->link == BOND_LINK_UP || slave->link == BOND_LINK_FAIL; info.tx_enabled = bond_is_active_slave(slave); netdev_lower_state_changed(slave->dev, &info); } #define BOND_NL_ERR(bond_dev, extack, errmsg) do { \ if (extack) \ NL_SET_ERR_MSG(extack, errmsg); \ else \ netdev_err(bond_dev, "Error: %s\n", errmsg); \ } while (0) #define SLAVE_NL_ERR(bond_dev, slave_dev, extack, errmsg) do { \ if (extack) \ NL_SET_ERR_MSG(extack, errmsg); \ else \ slave_err(bond_dev, slave_dev, "Error: %s\n", errmsg); \ } while (0) /* The bonding driver uses ether_setup() to convert a master bond device * to ARPHRD_ETHER, that resets the target netdevice's flags so we always * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE and IFF_UP * if they were set */ static void bond_ether_setup(struct net_device *bond_dev) { unsigned int flags = bond_dev->flags & (IFF_SLAVE | IFF_UP); ether_setup(bond_dev); bond_dev->flags |= IFF_MASTER | flags; bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; } void bond_xdp_set_features(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); xdp_features_t val = NETDEV_XDP_ACT_MASK; struct list_head *iter; struct slave *slave; ASSERT_RTNL(); if (!bond_xdp_check(bond) || !bond_has_slaves(bond)) { xdp_clear_features_flag(bond_dev); return; } bond_for_each_slave(bond, slave, iter) val &= slave->dev->xdp_features; val &= ~NETDEV_XDP_ACT_XSK_ZEROCOPY; xdp_set_features_flag(bond_dev, val); } /* enslave device <slave> to bond device <master> */ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, struct 
netlink_ext_ack *extack)
{
	struct bonding *bond = netdev_priv(bond_dev);
	const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
	struct slave *new_slave = NULL, *prev_slave;
	struct sockaddr_storage ss;
	int link_reporting;
	int res = 0, i;

	/* Overall flow: validate the candidate, adapt the bond's device type
	 * for the first slave, allocate the slave, adjust MTU/MAC, open the
	 * device, set the initial link state, do the mode specific setup,
	 * register rx handler / upper link / sysfs, and finally propagate
	 * flags, features and XDP. Failures unwind through the labels at
	 * the bottom in reverse order.
	 */

	/* a master device may only be enslaved if it is itself a bond */
	if (slave_dev->flags & IFF_MASTER &&
	    !netif_is_bond_master(slave_dev)) {
		BOND_NL_ERR(bond_dev, extack,
			    "Device type (master device) cannot be enslaved");
		return -EPERM;
	}

	/* warn only: the slave offers neither ethtool get_link nor an ioctl */
	if (!bond->params.use_carrier &&
	    slave_dev->ethtool_ops->get_link == NULL &&
	    slave_ops->ndo_eth_ioctl == NULL) {
		slave_warn(bond_dev, slave_dev, "no link monitoring support\n");
	}

	/* already in-use? */
	if (netdev_is_rx_handler_busy(slave_dev)) {
		SLAVE_NL_ERR(bond_dev, slave_dev, extack,
			     "Device is in use and cannot be enslaved");
		return -EBUSY;
	}

	if (bond_dev == slave_dev) {
		BOND_NL_ERR(bond_dev, extack, "Cannot enslave bond to itself.");
		return -EPERM;
	}

	/* vlan challenged mutual exclusion */
	/* no need to lock since we're protected by rtnl_lock */
	if (slave_dev->features & NETIF_F_VLAN_CHALLENGED) {
		slave_dbg(bond_dev, slave_dev, "is NETIF_F_VLAN_CHALLENGED\n");
		if (vlan_uses_dev(bond_dev)) {
			SLAVE_NL_ERR(bond_dev, slave_dev, extack,
				     "Can not enslave VLAN challenged device to VLAN enabled bond");
			return -EPERM;
		} else {
			slave_warn(bond_dev, slave_dev, "enslaved VLAN challenged slave. Adding VLANs will be blocked as long as it is part of bond.\n");
		}
	} else {
		slave_dbg(bond_dev, slave_dev, "is !NETIF_F_VLAN_CHALLENGED\n");
	}

	if (slave_dev->features & NETIF_F_HW_ESP)
		slave_dbg(bond_dev, slave_dev, "is esp-hw-offload capable\n");

	/* Old ifenslave binaries are no longer supported.  These can
	 * be identified with moderate accuracy by the state of the slave:
	 * the current ifenslave will set the interface down prior to
	 * enslaving it; the old ifenslave will not.
	 */
	if (slave_dev->flags & IFF_UP) {
		SLAVE_NL_ERR(bond_dev, slave_dev, extack,
			     "Device can not be enslaved while up");
		return -EPERM;
	}

	/* set bonding device ether type by slave - bonding netdevices are
	 * created with ether_setup, so when the slave type is not ARPHRD_ETHER
	 * there is a need to override some of the type dependent attribs/funcs.
	 *
	 * bond ether type mutual exclusion - don't allow slaves of dissimilar
	 * ether type (eg ARPHRD_ETHER and ARPHRD_INFINIBAND) share the same bond
	 */
	if (!bond_has_slaves(bond)) {
		if (bond_dev->type != slave_dev->type) {
			slave_dbg(bond_dev, slave_dev, "change device type from %d to %d\n",
				  bond_dev->type, slave_dev->type);

			/* give notifier listeners a chance to veto the change */
			res = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE,
						       bond_dev);
			res = notifier_to_errno(res);
			if (res) {
				slave_err(bond_dev, slave_dev, "refused to change device type\n");
				return -EBUSY;
			}

			/* Flush unicast and multicast addresses */
			dev_uc_flush(bond_dev);
			dev_mc_flush(bond_dev);

			if (slave_dev->type != ARPHRD_ETHER)
				bond_setup_by_slave(bond_dev, slave_dev);
			else
				bond_ether_setup(bond_dev);

			call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE,
						 bond_dev);
		}
	} else if (bond_dev->type != slave_dev->type) {
		SLAVE_NL_ERR(bond_dev, slave_dev, extack,
			     "Device type is different from other slaves");
		return -EINVAL;
	}

	/* from here on failures must go through err_undo_flags, because the
	 * bond's type may already have been changed above
	 */
	if (slave_dev->type == ARPHRD_INFINIBAND &&
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
		SLAVE_NL_ERR(bond_dev, slave_dev, extack,
			     "Only active-backup mode is supported for infiniband slaves");
		res = -EOPNOTSUPP;
		goto err_undo_flags;
	}

	if (!slave_ops->ndo_set_mac_address ||
	    slave_dev->type == ARPHRD_INFINIBAND) {
		slave_warn(bond_dev, slave_dev, "The slave device specified does not support setting the MAC address\n");
		if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP &&
		    bond->params.fail_over_mac != BOND_FOM_ACTIVE) {
			if (!bond_has_slaves(bond)) {
				/* first slave: silently switch the policy */
				bond->params.fail_over_mac = BOND_FOM_ACTIVE;
				slave_warn(bond_dev, slave_dev, "Setting fail_over_mac to active for active-backup mode\n");
			} else {
				SLAVE_NL_ERR(bond_dev, slave_dev, extack,
					     "Slave device does not support setting the MAC address, but fail_over_mac is not set to active");
				res = -EOPNOTSUPP;
				goto err_undo_flags;
			}
		}
	}

	call_netdevice_notifiers(NETDEV_JOIN, slave_dev);

	/* If this is the first slave, then we need to set the master's hardware
	 * address to be the same as the slave's.
	 */
	if (!bond_has_slaves(bond) &&
	    bond->dev->addr_assign_type == NET_ADDR_RANDOM) {
		res = bond_set_dev_addr(bond->dev, slave_dev);
		if (res)
			goto err_undo_flags;
	}

	new_slave = bond_alloc_slave(bond, slave_dev);
	if (!new_slave) {
		res = -ENOMEM;
		goto err_undo_flags;
	}

	/* Set the new_slave's queue_id to be zero.  Queue ID mapping
	 * is set via sysfs or module option if desired.
	 */
	new_slave->queue_id = 0;

	/* Save slave's original mtu and then set it to match the bond */
	new_slave->original_mtu = slave_dev->mtu;
	res = dev_set_mtu(slave_dev, bond->dev->mtu);
	if (res) {
		slave_err(bond_dev, slave_dev, "Error %d calling dev_set_mtu\n", res);
		goto err_free;
	}

	/* Save slave's original ("permanent") mac address for modes
	 * that need it, and for restoring it upon release, and then
	 * set it to the master's address
	 */
	bond_hw_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr,
			  slave_dev->addr_len);

	if (!bond->params.fail_over_mac ||
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
		/* Set slave to master's mac address.  The application already
		 * set the master's mac address to that of the first slave
		 */
		memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len);
		ss.ss_family = slave_dev->type;
		res = dev_set_mac_address(slave_dev, (struct sockaddr *)&ss,
					  extack);
		if (res) {
			slave_err(bond_dev, slave_dev, "Error %d calling set_mac_address\n", res);
			goto err_restore_mtu;
		}
	}

	/* set no_addrconf flag before open to prevent IPv6 addrconf */
	slave_dev->priv_flags |= IFF_NO_ADDRCONF;

	/* open the slave since the application closed it */
	res = dev_open(slave_dev, extack);
	if (res) {
		slave_err(bond_dev, slave_dev, "Opening slave failed\n");
		goto err_restore_mac;
	}

	slave_dev->priv_flags |= IFF_BONDING;
	/* initialize slave stats */
	dev_get_stats(new_slave->dev, &new_slave->slave_stats);

	if (bond_is_lb(bond)) {
		/* bond_alb_init_slave() must be called before all other stages since
		 * it might fail and we do not want to have to undo everything
		 */
		res = bond_alb_init_slave(bond, new_slave);
		if (res)
			goto err_close;
	}

	res = vlan_vids_add_by_dev(slave_dev, bond_dev);
	if (res) {
		slave_err(bond_dev, slave_dev, "Couldn't add bond vlan ids\n");
		goto err_close;
	}

	/* sampled before the new slave is linked; NULL means "first slave" */
	prev_slave = bond_last_slave(bond);

	new_slave->delay = 0;
	new_slave->link_failure_count = 0;

	if (bond_update_speed_duplex(new_slave) &&
	    bond_needs_speed_duplex(bond))
		new_slave->link = BOND_LINK_DOWN;

	/* backdate the rx/tx timestamps by one ARP interval plus one jiffy */
	new_slave->last_rx = jiffies -
		(msecs_to_jiffies(bond->params.arp_interval) + 1);
	for (i = 0; i < BOND_MAX_ARP_TARGETS; i++)
		new_slave->target_last_arp_rx[i] = new_slave->last_rx;

	new_slave->last_tx = new_slave->last_rx;

	if (bond->params.miimon && !bond->params.use_carrier) {
		link_reporting = bond_check_dev_link(bond, slave_dev, 1);

		if ((link_reporting == -1) && !bond->params.arp_interval) {
			/* miimon is set but a bonded network driver
			 * does not support ETHTOOL/MII and
			 * arp_interval is not set.  Note: if
			 * use_carrier is enabled, we will never go
			 * here (because netif_carrier is always
			 * supported); thus, we don't need to change
			 * the messages for netif_carrier.
			 */
			slave_warn(bond_dev, slave_dev, "MII and ETHTOOL support not available for slave, and arp_interval/arp_ip_target module parameters not specified, thus bonding will not detect link failures! see bonding.txt for details\n");
		} else if (link_reporting == -1) {
			/* unable get link status using mii/ethtool */
			slave_warn(bond_dev, slave_dev, "can't get link status from slave; the network driver associated with this interface does not support MII or ETHTOOL link status reporting, thus miimon has no effect on this interface\n");
		}
	}

	/* check for initial state */
	new_slave->link = BOND_LINK_NOCHANGE;
	if (bond->params.miimon) {
		if (bond_check_dev_link(bond, slave_dev, 0) == BMSR_LSTATUS) {
			if (bond->params.updelay) {
				/* link is there, but honour the up delay first */
				bond_set_slave_link_state(new_slave,
							  BOND_LINK_BACK,
							  BOND_SLAVE_NOTIFY_NOW);
				new_slave->delay = bond->params.updelay;
			} else {
				bond_set_slave_link_state(new_slave,
							  BOND_LINK_UP,
							  BOND_SLAVE_NOTIFY_NOW);
			}
		} else {
			bond_set_slave_link_state(new_slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_NOW);
		}
	} else if (bond->params.arp_interval) {
		/* ARP monitoring: start from the carrier state */
		bond_set_slave_link_state(new_slave,
					  (netif_carrier_ok(slave_dev) ?
					  BOND_LINK_UP : BOND_LINK_DOWN),
					  BOND_SLAVE_NOTIFY_NOW);
	} else {
		/* no monitoring configured: treat the link as up */
		bond_set_slave_link_state(new_slave, BOND_LINK_UP,
					  BOND_SLAVE_NOTIFY_NOW);
	}

	if (new_slave->link != BOND_LINK_DOWN)
		new_slave->last_link_up = jiffies;
	slave_dbg(bond_dev, slave_dev, "Initial state of slave is BOND_LINK_%s\n",
		  new_slave->link == BOND_LINK_DOWN ? "DOWN" :
		  (new_slave->link == BOND_LINK_UP ? "UP" : "BACK"));

	if (bond_uses_primary(bond) && bond->params.primary[0]) {
		/* if there is a primary slave, remember it */
		if (strcmp(bond->params.primary, new_slave->dev->name) == 0) {
			rcu_assign_pointer(bond->primary_slave, new_slave);
			bond->force_primary = true;
		}
	}

	switch (BOND_MODE(bond)) {
	case BOND_MODE_ACTIVEBACKUP:
		bond_set_slave_inactive_flags(new_slave,
					      BOND_SLAVE_NOTIFY_NOW);
		break;
	case BOND_MODE_8023AD:
		/* in 802.3ad mode, the internal mechanism
		 * will activate the slaves in the selected
		 * aggregator
		 */
		bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW);
		/* if this is the first slave */
		if (!prev_slave) {
			SLAVE_AD_INFO(new_slave)->id = 1;
			/* Initialize AD with the number of times that the AD timer is called in 1 second
			 * can be called only after the mac address of the bond is set
			 */
			bond_3ad_initialize(bond);
		} else {
			SLAVE_AD_INFO(new_slave)->id =
				SLAVE_AD_INFO(prev_slave)->id + 1;
		}

		bond_3ad_bind_slave(new_slave);
		break;
	case BOND_MODE_TLB:
	case BOND_MODE_ALB:
		bond_set_active_slave(new_slave);
		bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW);
		break;
	default:
		slave_dbg(bond_dev, slave_dev, "This slave is always active in trunk mode\n");

		/* always active in trunk mode */
		bond_set_active_slave(new_slave);

		/* In trunking mode there is little meaning to curr_active_slave
		 * anyway (it holds no special properties of the bond device),
		 * so we can change it without calling change_active_interface()
		 */
		if (!rcu_access_pointer(bond->curr_active_slave) &&
		    new_slave->link == BOND_LINK_UP)
			rcu_assign_pointer(bond->curr_active_slave, new_slave);

		break;
	} /* switch(bond_mode) */

#ifdef CONFIG_NET_POLL_CONTROLLER
	if (bond->dev->npinfo) {
		if (slave_enable_netpoll(new_slave)) {
			slave_info(bond_dev, slave_dev, "master_dev is using netpoll, but new slave device does not support netpoll\n");
			res = -EBUSY;
			goto err_detach;
		}
	}
#endif

	if (!(bond_dev->features & NETIF_F_LRO))
		dev_disable_lro(slave_dev);

	res = netdev_rx_handler_register(slave_dev, bond_handle_frame,
					 new_slave);
	if (res) {
		slave_dbg(bond_dev, slave_dev, "Error %d calling netdev_rx_handler_register\n", res);
		goto err_detach;
	}

	res = bond_master_upper_dev_link(bond, new_slave, extack);
	if (res) {
		slave_dbg(bond_dev, slave_dev, "Error %d calling bond_master_upper_dev_link\n", res);
		goto err_unregister;
	}

	bond_lower_state_changed(new_slave);

	res = bond_sysfs_slave_add(new_slave);
	if (res) {
		slave_dbg(bond_dev, slave_dev, "Error %d calling bond_sysfs_slave_add\n", res);
		goto err_upper_unlink;
	}

	/* If the mode uses primary, then the following is handled by
	 * bond_change_active_slave().
	 */
	if (!bond_uses_primary(bond)) {
		/* set promiscuity level to new slave */
		if (bond_dev->flags & IFF_PROMISC) {
			res = dev_set_promiscuity(slave_dev, 1);
			if (res)
				goto err_sysfs_del;
		}

		/* set allmulti level to new slave */
		if (bond_dev->flags & IFF_ALLMULTI) {
			res = dev_set_allmulti(slave_dev, 1);
			if (res) {
				/* undo the promiscuity bump taken just above */
				if (bond_dev->flags & IFF_PROMISC)
					dev_set_promiscuity(slave_dev, -1);
				goto err_sysfs_del;
			}
		}

		if (bond_dev->flags & IFF_UP) {
			netif_addr_lock_bh(bond_dev);
			dev_mc_sync_multiple(slave_dev, bond_dev);
			dev_uc_sync_multiple(slave_dev, bond_dev);
			netif_addr_unlock_bh(bond_dev);

			if (BOND_MODE(bond) == BOND_MODE_8023AD)
				dev_mc_add(slave_dev, lacpdu_mcast_addr);
		}
	}

	/* NOTE(review): slave_cnt is incremented here, but the XDP failure
	 * paths below jump to err_sysfs_del and none of the error labels
	 * decrements it again - confirm whether the counter can go stale.
	 */
	bond->slave_cnt++;
	bond_compute_features(bond);
	bond_set_carrier(bond);

	if (bond_uses_primary(bond)) {
		block_netpoll_tx();
		bond_select_active_slave(bond);
		unblock_netpoll_tx();
	}

	if (bond_mode_can_use_xmit_hash(bond))
		bond_update_slave_arr(bond, NULL);

	if (!slave_dev->netdev_ops->ndo_bpf ||
	    !slave_dev->netdev_ops->ndo_xdp_xmit) {
		/* a slave without XDP support is only fatal when the bond
		 * has a program attached
		 */
		if (bond->xdp_prog) {
			SLAVE_NL_ERR(bond_dev, slave_dev, extack,
				     "Slave does not support XDP");
			res = -EOPNOTSUPP;
			goto err_sysfs_del;
		}
	} else if (bond->xdp_prog) {
		struct netdev_bpf xdp = {
			.command = XDP_SETUP_PROG,
			.flags   = 0,
			.prog    = bond->xdp_prog,
			.extack  = extack,
		};

		if (dev_xdp_prog_count(slave_dev) > 0) {
			SLAVE_NL_ERR(bond_dev, slave_dev, extack,
				     "Slave has XDP program loaded, please unload before enslaving");
			res = -EOPNOTSUPP;
			goto err_sysfs_del;
		}

		res = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp);
		if (res < 0) {
			/* ndo_bpf() sets extack error message */
			slave_dbg(bond_dev, slave_dev, "Error %d calling ndo_bpf\n", res);
			goto err_sysfs_del;
		}
		/* take a program reference for the newly attached slave */
		if (bond->xdp_prog)
			bpf_prog_inc(bond->xdp_prog);
	}

	bond_xdp_set_features(bond_dev);

	slave_info(bond_dev, slave_dev, "Enslaving as %s interface with %s link\n",
		   bond_is_active_slave(new_slave) ? "an active" : "a backup",
		   new_slave->link != BOND_LINK_DOWN ? "an up" : "a down");

	/* enslave is successful */
	bond_queue_slave_event(new_slave);
	return 0;

/* Undo stages on error */
err_sysfs_del:
	bond_sysfs_slave_del(new_slave);

err_upper_unlink:
	bond_upper_dev_unlink(bond, new_slave);

err_unregister:
	netdev_rx_handler_unregister(slave_dev);

err_detach:
	vlan_vids_del_by_dev(slave_dev, bond_dev);
	if (rcu_access_pointer(bond->primary_slave) == new_slave)
		RCU_INIT_POINTER(bond->primary_slave, NULL);
	if (rcu_access_pointer(bond->curr_active_slave) == new_slave) {
		block_netpoll_tx();
		bond_change_active_slave(bond, NULL);
		bond_select_active_slave(bond);
		unblock_netpoll_tx();
	}
	/* either primary_slave or curr_active_slave might've changed */
	synchronize_rcu();
	slave_disable_netpoll(new_slave);

err_close:
	/* a nested bond keeps IFF_BONDING; every other device loses it */
	if (!netif_is_bond_master(slave_dev))
		slave_dev->priv_flags &= ~IFF_BONDING;
	dev_close(slave_dev);

err_restore_mac:
	slave_dev->priv_flags &= ~IFF_NO_ADDRCONF;
	if (!bond->params.fail_over_mac ||
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
		/* XXX TODO - fom follow mode needs to change master's
		 * MAC if this slave's MAC is in use by the bond, or at
		 * least print a warning.
		 */
		bond_hw_addr_copy(ss.__data, new_slave->perm_hwaddr,
				  new_slave->dev->addr_len);
		ss.ss_family = slave_dev->type;
		dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL);
	}

err_restore_mtu:
	dev_set_mtu(slave_dev, new_slave->original_mtu);

err_free:
	/* dropping the kobject reference releases the slave */
	kobject_put(&new_slave->kobj);

err_undo_flags:
	/* Enslave of first slave has failed and we need to fix master's mac */
	if (!bond_has_slaves(bond)) {
		if (ether_addr_equal_64bits(bond_dev->dev_addr,
					    slave_dev->dev_addr))
			eth_hw_addr_random(bond_dev);
		if (bond_dev->type != ARPHRD_ETHER) {
			/* revert the device type change done for this slave */
			dev_close(bond_dev);
			bond_ether_setup(bond_dev);
		}
	}

	return res;
}

/* Try to release the slave device <slave> from the bond device <master>
 * It is legal to access curr_active_slave without a lock because the whole
 * function is RTNL-locked. If "all" is true it means that the function is
 * being called while destroying a bond interface and all slaves are being
 * released.
 *
 * The rules for slave state should be:
 *   for Active/Backup:
 *     Active stays on, all backups go down
 *   for Bonded connections:
 *     The first up interface should be left on and all others downed.
*/
static int __bond_release_one(struct net_device *bond_dev,
			      struct net_device *slave_dev,
			      bool all, bool unregister)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave, *oldcurrent;
	struct sockaddr_storage ss;
	/* snapshot taken up front; see the promiscuity comment further down */
	int old_flags = bond_dev->flags;
	netdev_features_t old_features = bond_dev->features;

	/* slave is not a slave or master is not master of this slave */
	if (!(slave_dev->flags & IFF_SLAVE) ||
	    !netdev_has_upper_dev(slave_dev, bond_dev)) {
		slave_dbg(bond_dev, slave_dev, "cannot release slave\n");
		return -EINVAL;
	}

	block_netpoll_tx();

	slave = bond_get_slave_by_dev(bond, slave_dev);
	if (!slave) {
		/* not a slave of this bond */
		slave_info(bond_dev, slave_dev, "interface not enslaved\n");
		unblock_netpoll_tx();
		return -EINVAL;
	}

	bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW);

	bond_sysfs_slave_del(slave);

	/* recompute stats just before removing the slave */
	bond_get_stats(bond->dev, &bond->bond_stats);

	if (bond->xdp_prog) {
		/* detach the bond's XDP program from the slave (prog = NULL) */
		struct netdev_bpf xdp = {
			.command = XDP_SETUP_PROG,
			.flags   = 0,
			.prog	 = NULL,
			.extack  = NULL,
		};
		if (slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp))
			slave_warn(bond_dev, slave_dev, "failed to unload XDP program\n");
	}

	/* unregister rx_handler early so bond_handle_frame wouldn't be called
	 * for this slave anymore.
	 */
	netdev_rx_handler_unregister(slave_dev);

	if (BOND_MODE(bond) == BOND_MODE_8023AD)
		bond_3ad_unbind_slave(slave);

	bond_upper_dev_unlink(bond, slave);

	if (bond_mode_can_use_xmit_hash(bond))
		bond_update_slave_arr(bond, slave);

	slave_info(bond_dev, slave_dev, "Releasing %s interface\n",
		    bond_is_active_slave(slave) ? "active" : "backup");

	oldcurrent = rcu_access_pointer(bond->curr_active_slave);

	RCU_INIT_POINTER(bond->current_arp_slave, NULL);

	if (!all && (!bond->params.fail_over_mac ||
		     BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) {
		/* warn only: the bond keeps using the departing slave's MAC */
		if (ether_addr_equal_64bits(bond_dev->dev_addr, slave->perm_hwaddr) &&
		    bond_has_slaves(bond))
			slave_warn(bond_dev, slave_dev, "the permanent HWaddr of slave - %pM - is still in use by bond - set the HWaddr of slave to a different address to avoid conflicts\n",
				   slave->perm_hwaddr);
	}

	if (rtnl_dereference(bond->primary_slave) == slave)
		RCU_INIT_POINTER(bond->primary_slave, NULL);

	if (oldcurrent == slave)
		bond_change_active_slave(bond, NULL);

	if (bond_is_lb(bond)) {
		/* Must be called only after the slave has been
		 * detached from the list and the curr_active_slave
		 * has been cleared (if our_slave == old_current),
		 * but before a new active slave is selected.
		 */
		bond_alb_deinit_slave(bond, slave);
	}

	if (all) {
		RCU_INIT_POINTER(bond->curr_active_slave, NULL);
	} else if (oldcurrent == slave) {
		/* Note that we hold RTNL over this sequence, so there
		 * is no concern that another slave add/remove event
		 * will interfere.
		 */
		bond_select_active_slave(bond);
	}

	bond_set_carrier(bond);
	/* last slave gone: give the bond a fresh random MAC */
	if (!bond_has_slaves(bond))
		eth_hw_addr_random(bond_dev);

	unblock_netpoll_tx();
	synchronize_rcu();
	bond->slave_cnt--;

	if (!bond_has_slaves(bond)) {
		call_netdevice_notifiers(NETDEV_CHANGEADDR, bond->dev);
		call_netdevice_notifiers(NETDEV_RELEASE, bond->dev);
	}

	bond_compute_features(bond);
	if (!(bond_dev->features & NETIF_F_VLAN_CHALLENGED) &&
	    (old_features & NETIF_F_VLAN_CHALLENGED))
		slave_info(bond_dev, slave_dev, "last VLAN challenged slave left bond - VLAN blocking is removed\n");

	vlan_vids_del_by_dev(slave_dev, bond_dev);

	/* If the mode uses primary, then this case was handled above by
	 * bond_change_active_slave(..., NULL)
	 */
	if (!bond_uses_primary(bond)) {
		/* unset promiscuity level from slave
		 * NOTE: The NETDEV_CHANGEADDR call above may change the value
		 * of the IFF_PROMISC flag in the bond_dev, but we need the
		 * value of that flag before that change, as that was the value
		 * when this slave was attached, so we cache at the start of the
		 * function and use it here. Same goes for ALLMULTI below
		 */
		if (old_flags & IFF_PROMISC)
			dev_set_promiscuity(slave_dev, -1);

		/* unset allmulti level from slave */
		if (old_flags & IFF_ALLMULTI)
			dev_set_allmulti(slave_dev, -1);

		if (old_flags & IFF_UP)
			bond_hw_addr_flush(bond_dev, slave_dev);
	}

	slave_disable_netpoll(slave);

	/* close slave before restoring its mac address */
	dev_close(slave_dev);

	slave_dev->priv_flags &= ~IFF_NO_ADDRCONF;

	if (bond->params.fail_over_mac != BOND_FOM_ACTIVE ||
	    BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
		/* restore original ("permanent") mac address */
		bond_hw_addr_copy(ss.__data, slave->perm_hwaddr,
				  slave->dev->addr_len);
		ss.ss_family = slave_dev->type;
		dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL);
	}

	/* the "unregister" callers restore the MTU via __dev_set_mtu() */
	if (unregister)
		__dev_set_mtu(slave_dev, slave->original_mtu);
	else
		dev_set_mtu(slave_dev, slave->original_mtu);

	if (!netif_is_bond_master(slave_dev))
		slave_dev->priv_flags &= ~IFF_BONDING;

	bond_xdp_set_features(bond_dev);
	/* drop the slave's kobject reference; this releases the slave */
	kobject_put(&slave->kobj);

	return 0;
}

/* A wrapper used because of ndo_del_link: releases a single slave with
 * all == false and unregister == false.
 */
int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
{
	return __bond_release_one(bond_dev, slave_dev, false, false);
}

/* First release a slave and then destroy the bond if no more slaves are left.
 * Must be under rtnl_lock when this function is called.
*/
static int bond_release_and_destroy(struct net_device *bond_dev,
				    struct net_device *slave_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	int ret;

	ret = __bond_release_one(bond_dev, slave_dev, false, true);
	/* only tear the bond down if it is empty and not already going away */
	if (ret == 0 && !bond_has_slaves(bond) &&
	    bond_dev->reg_state != NETREG_UNREGISTERING) {
		bond_dev->priv_flags |= IFF_DISABLE_NETPOLL;
		netdev_info(bond_dev, "Destroying bond\n");
		bond_remove_proc_entry(bond);
		unregister_netdevice(bond_dev);
	}
	return ret;
}

/* Fill @info with the bond-level information of @bond_dev */
static void bond_info_query(struct net_device *bond_dev, struct ifbond *info)
{
	struct bonding *bond = netdev_priv(bond_dev);

	bond_fill_ifbond(bond, info);
}

/* Fill @info for the slave whose position in the bond's slave iteration
 * equals info->slave_id.
 *
 * Returns 0 when such a slave exists, -ENODEV otherwise.
 */
static int bond_slave_info_query(struct net_device *bond_dev, struct ifslave *info)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct list_head *iter;
	int i = 0, res = -ENODEV;
	struct slave *slave;

	bond_for_each_slave(bond, slave, iter) {
		if (i++ == (int)info->slave_id) {
			res = 0;
			bond_fill_ifslave(slave, info);
			break;
		}
	}

	return res;
}

/*-------------------------------- Monitoring -------------------------------*/

/* called with rcu_read_lock()
 *
 * Inspection phase of the MII monitor: for every slave a new link state is
 * proposed (not committed) based on the current link reading and the
 * up/down delay counters.
 *
 * Returns the number of proposed state changes; non-zero means a commit
 * phase is required.
 */
static int bond_miimon_inspect(struct bonding *bond)
{
	bool ignore_updelay = false;
	int link_state, commit = 0;
	struct list_head *iter;
	struct slave *slave;

	/* skip the up delay when there is currently nothing usable: no
	 * active slave (active-backup) or an empty usable_slaves array
	 */
	if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) {
		ignore_updelay = !rcu_dereference(bond->curr_active_slave);
	} else {
		struct bond_up_slave *usable_slaves;

		usable_slaves = rcu_dereference(bond->usable_slaves);

		if (usable_slaves && usable_slaves->count == 0)
			ignore_updelay = true;
	}

	bond_for_each_slave_rcu(bond, slave, iter) {
		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);

		link_state = bond_check_dev_link(bond, slave->dev, 0);

		switch (slave->link) {
		case BOND_LINK_UP:
			if (link_state)
				continue;

			/* link just dropped: start the down delay */
			bond_propose_link_state(slave, BOND_LINK_FAIL);
			commit++;
			slave->delay = bond->params.downdelay;
			if (slave->delay && net_ratelimit()) {
				slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n",
					   (BOND_MODE(bond) ==
					    BOND_MODE_ACTIVEBACKUP) ?
					    (bond_is_active_slave(slave) ?
					     "active " : "backup ") : "",
					   bond->params.downdelay * bond->params.miimon);
			}
			fallthrough;
		case BOND_LINK_FAIL:
			if (link_state) {
				/* recovered before downdelay expired */
				bond_propose_link_state(slave, BOND_LINK_UP);
				slave->last_link_up = jiffies;
				if (net_ratelimit())
					slave_info(bond->dev, slave->dev, "link status up again after %d ms\n",
						   (bond->params.downdelay - slave->delay) *
						   bond->params.miimon);
				commit++;
				continue;
			}

			/* down delay expired: the link is definitely down */
			if (slave->delay <= 0) {
				bond_propose_link_state(slave, BOND_LINK_DOWN);
				commit++;
				continue;
			}

			slave->delay--;
			break;

		case BOND_LINK_DOWN:
			if (!link_state)
				continue;

			/* link just came back: start the up delay */
			bond_propose_link_state(slave, BOND_LINK_BACK);
			commit++;
			slave->delay = bond->params.updelay;

			if (slave->delay && net_ratelimit()) {
				slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n",
					   ignore_updelay ? 0 :
					   bond->params.updelay *
					   bond->params.miimon);
			}
			fallthrough;
		case BOND_LINK_BACK:
			if (!link_state) {
				bond_propose_link_state(slave, BOND_LINK_DOWN);
				if (net_ratelimit())
					slave_info(bond->dev, slave->dev, "link status down again after %d ms\n",
						   (bond->params.updelay - slave->delay) *
						   bond->params.miimon);
				commit++;
				continue;
			}

			if (ignore_updelay)
				slave->delay = 0;

			if (slave->delay <= 0) {
				bond_propose_link_state(slave, BOND_LINK_UP);
				commit++;
				/* only the first slave coming up skips the delay */
				ignore_updelay = false;
				continue;
			}

			slave->delay--;
			break;
		}
	}

	return commit;
}

/* Forward a committed link change to the mode specific handler: 802.3ad,
 * TLB/ALB, or a slave array rebuild for XOR. Other modes need no action.
 */
static void bond_miimon_link_change(struct bonding *bond,
				    struct slave *slave,
				    char link)
{
	switch (BOND_MODE(bond)) {
	case BOND_MODE_8023AD:
		bond_3ad_handle_link_change(slave, link);
		break;
	case BOND_MODE_TLB:
	case BOND_MODE_ALB:
		bond_alb_handle_link_change(bond, slave, link);
		break;
	case BOND_MODE_XOR:
		bond_update_slave_arr(bond, NULL);
		break;
	}
}

/* Commit phase of the MII monitor: apply the link states proposed by
 * bond_miimon_inspect(), re-select the active slave when needed and update
 * the bond's carrier. Asserts RTNL.
 */
static void bond_miimon_commit(struct bonding *bond)
{
	struct slave *slave, *primary, *active;
	bool do_failover = false;
	struct list_head *iter;

	ASSERT_RTNL();

	bond_for_each_slave(bond, slave, iter) {
		switch (slave->link_new_state) {
		case BOND_LINK_NOCHANGE:
			/* For 802.3ad mode, check current slave speed and
			 * duplex again in case its port was disabled after
			 * invalid speed/duplex reporting but recovered before
			 * link monitoring could make a decision on the actual
			 * link status
			 */
			if (BOND_MODE(bond) == BOND_MODE_8023AD &&
			    slave->link == BOND_LINK_UP)
				bond_3ad_adapter_speed_duplex_changed(slave);
			continue;

		case BOND_LINK_UP:
			/* without speed/duplex the slave cannot be used in
			 * modes that need them, so keep it down
			 */
			if (bond_update_speed_duplex(slave) &&
			    bond_needs_speed_duplex(bond)) {
				slave->link = BOND_LINK_DOWN;
				if (net_ratelimit())
					slave_warn(bond->dev, slave->dev,
						   "failed to get link speed/duplex\n");
				continue;
			}
			bond_set_slave_link_state(slave, BOND_LINK_UP,
						  BOND_SLAVE_NOTIFY_NOW);
			slave->last_link_up = jiffies;

			primary = rtnl_dereference(bond->primary_slave);
			if (BOND_MODE(bond) == BOND_MODE_8023AD) {
				/* prevent it from being the active one */
				bond_set_backup_slave(slave);
			} else if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
				/* make it immediately active */
				bond_set_active_slave(slave);
			}

			slave_info(bond->dev, slave->dev, "link status definitely up, %u Mbps %s duplex\n",
				   slave->speed == SPEED_UNKNOWN ? 0 : slave->speed,
				   slave->duplex ? "full" : "half");

			bond_miimon_link_change(bond, slave, BOND_LINK_UP);

			/* fail over when nothing is active, when the primary
			 * came back, or when this slave has a higher prio
			 */
			active = rtnl_dereference(bond->curr_active_slave);
			if (!active || slave == primary || slave->prio > active->prio)
				do_failover = true;

			continue;

		case BOND_LINK_DOWN:
			/* saturating counter */
			if (slave->link_failure_count < UINT_MAX)
				slave->link_failure_count++;

			bond_set_slave_link_state(slave, BOND_LINK_DOWN,
						  BOND_SLAVE_NOTIFY_NOW);

			if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP ||
			    BOND_MODE(bond) == BOND_MODE_8023AD)
				bond_set_slave_inactive_flags(slave,
							      BOND_SLAVE_NOTIFY_NOW);

			slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n");

			bond_miimon_link_change(bond, slave, BOND_LINK_DOWN);

			if (slave == rcu_access_pointer(bond->curr_active_slave))
				do_failover = true;

			continue;

		default:
			slave_err(bond->dev, slave->dev, "invalid new link %d on slave\n",
				  slave->link_new_state);
			bond_propose_link_state(slave, BOND_LINK_NOCHANGE);

			continue;
		}
	}

	if (do_failover) {
		block_netpoll_tx();
		bond_select_active_slave(bond);
		unblock_netpoll_tx();
	}

	bond_set_carrier(bond);
}

/* bond_mii_monitor
 *
 * Really a wrapper that splits the mii monitor into two phases: an
 * inspection, then (if inspection indicates something needs to be done)
 * an acquisition of appropriate locks followed by a commit phase to
 * implement whatever link state changes are indicated.
*/ static void bond_mii_monitor(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, mii_work.work); bool should_notify_peers = false; bool commit; unsigned long delay; struct slave *slave; struct list_head *iter; delay = msecs_to_jiffies(bond->params.miimon); if (!bond_has_slaves(bond)) goto re_arm; rcu_read_lock(); should_notify_peers = bond_should_notify_peers(bond); commit = !!bond_miimon_inspect(bond); if (bond->send_peer_notif) { rcu_read_unlock(); if (rtnl_trylock()) { bond->send_peer_notif--; rtnl_unlock(); } } else { rcu_read_unlock(); } if (commit) { /* Race avoidance with bond_close cancel of workqueue */ if (!rtnl_trylock()) { delay = 1; should_notify_peers = false; goto re_arm; } bond_for_each_slave(bond, slave, iter) { bond_commit_link_state(slave, BOND_SLAVE_NOTIFY_LATER); } bond_miimon_commit(bond); rtnl_unlock(); /* might sleep, hold no other locks */ } re_arm: if (bond->params.miimon) queue_delayed_work(bond->wq, &bond->mii_work, delay); if (should_notify_peers) { if (!rtnl_trylock()) return; call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); rtnl_unlock(); } } static int bond_upper_dev_walk(struct net_device *upper, struct netdev_nested_priv *priv) { __be32 ip = *(__be32 *)priv->data; return ip == bond_confirm_addr(upper, 0, ip); } static bool bond_has_this_ip(struct bonding *bond, __be32 ip) { struct netdev_nested_priv priv = { .data = (void *)&ip, }; bool ret = false; if (ip == bond_confirm_addr(bond->dev, 0, ip)) return true; rcu_read_lock(); if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk, &priv)) ret = true; rcu_read_unlock(); return ret; } #define BOND_VLAN_PROTO_NONE cpu_to_be16(0xffff) static bool bond_handle_vlan(struct slave *slave, struct bond_vlan_tag *tags, struct sk_buff *skb) { struct net_device *bond_dev = slave->bond->dev; struct net_device *slave_dev = slave->dev; struct bond_vlan_tag *outer_tag = tags; if (!tags || tags->vlan_proto == BOND_VLAN_PROTO_NONE) return true; 
tags++; /* Go through all the tags backwards and add them to the packet */ while (tags->vlan_proto != BOND_VLAN_PROTO_NONE) { if (!tags->vlan_id) { tags++; continue; } slave_dbg(bond_dev, slave_dev, "inner tag: proto %X vid %X\n", ntohs(outer_tag->vlan_proto), tags->vlan_id); skb = vlan_insert_tag_set_proto(skb, tags->vlan_proto, tags->vlan_id); if (!skb) { net_err_ratelimited("failed to insert inner VLAN tag\n"); return false; } tags++; } /* Set the outer tag */ if (outer_tag->vlan_id) { slave_dbg(bond_dev, slave_dev, "outer tag: proto %X vid %X\n", ntohs(outer_tag->vlan_proto), outer_tag->vlan_id); __vlan_hwaccel_put_tag(skb, outer_tag->vlan_proto, outer_tag->vlan_id); } return true; } /* We go to the (large) trouble of VLAN tagging ARP frames because * switches in VLAN mode (especially if ports are configured as * "native" to a VLAN) might not pass non-tagged frames. */ static void bond_arp_send(struct slave *slave, int arp_op, __be32 dest_ip, __be32 src_ip, struct bond_vlan_tag *tags) { struct net_device *bond_dev = slave->bond->dev; struct net_device *slave_dev = slave->dev; struct sk_buff *skb; slave_dbg(bond_dev, slave_dev, "arp %d on slave: dst %pI4 src %pI4\n", arp_op, &dest_ip, &src_ip); skb = arp_create(arp_op, ETH_P_ARP, dest_ip, slave_dev, src_ip, NULL, slave_dev->dev_addr, NULL); if (!skb) { net_err_ratelimited("ARP packet allocation failed\n"); return; } if (bond_handle_vlan(slave, tags, skb)) { slave_update_last_tx(slave); arp_xmit(skb); } return; } /* Validate the device path between the @start_dev and the @end_dev. * The path is valid if the @end_dev is reachable through device * stacking. * When the path is validated, collect any vlan information in the * path. 
*/
struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev,
					      struct net_device *end_dev,
					      int level)
{
	struct bond_vlan_tag *tags;
	struct net_device *upper;
	struct list_head  *iter;

	/* end of the path: allocate level + 1 zeroed entries and mark the
	 * last one as terminator; the caller owns (and frees) the array
	 */
	if (start_dev == end_dev) {
		tags = kcalloc(level + 1, sizeof(*tags), GFP_ATOMIC);
		if (!tags)
			return ERR_PTR(-ENOMEM);
		tags[level].vlan_proto = BOND_VLAN_PROTO_NONE;
		return tags;
	}

	netdev_for_each_upper_dev_rcu(start_dev, upper, iter) {
		tags = bond_verify_device_path(upper, end_dev, level + 1);
		if (IS_ERR_OR_NULL(tags)) {
			/* propagate errors, keep searching on "not found" */
			if (IS_ERR(tags))
				return tags;
			continue;
		}
		/* on the way back, record the VLAN found at this level */
		if (is_vlan_dev(upper)) {
			tags[level].vlan_proto = vlan_dev_vlan_proto(upper);
			tags[level].vlan_id = vlan_dev_vlan_id(upper);
		}

		return tags;
	}

	/* @end_dev is not reachable from @start_dev */
	return NULL;
}

/* Send an ARP request through @slave to every configured ARP target.
 *
 * For each target the route is looked up; the request is sent when the
 * route leads to the bond itself or to a device stacked on top of it (in
 * which case the VLAN tags along that path are applied). Targets routed
 * through unrelated devices are skipped. When no route exists at all, an
 * untagged probe with source address 0 is sent anyway.
 */
static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
{
	struct rtable *rt;
	struct bond_vlan_tag *tags;
	__be32 *targets = bond->params.arp_targets, addr;
	int i;

	for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) {
		slave_dbg(bond->dev, slave->dev, "%s: target %pI4\n",
			  __func__, &targets[i]);
		tags = NULL;

		/* Find out through which dev should the packet go */
		rt = ip_route_output(dev_net(bond->dev), targets[i], 0, 0, 0,
				     RT_SCOPE_LINK);
		if (IS_ERR(rt)) {
			/* there's no route to target - try to send arp
			 * probe to generate any traffic (arp_validate=0)
			 */
			if (bond->params.arp_validate)
				pr_warn_once("%s: no route to arp_ip_target %pI4 and arp_validate is set\n",
					     bond->dev->name,
					     &targets[i]);
			bond_arp_send(slave, ARPOP_REQUEST, targets[i],
				      0, tags);
			continue;
		}

		/* bond device itself */
		if (rt->dst.dev == bond->dev)
			goto found;

		rcu_read_lock();
		tags = bond_verify_device_path(bond->dev, rt->dst.dev, 0);
		rcu_read_unlock();

		if (!IS_ERR_OR_NULL(tags))
			goto found;

		/* Not our device - skip */
		slave_dbg(bond->dev, slave->dev, "no path to arp_ip_target %pI4 via rt.dev %s\n",
			   &targets[i], rt->dst.dev ? rt->dst.dev->name : "NULL");

		ip_rt_put(rt);
		continue;

found:
		addr = bond_confirm_addr(rt->dst.dev, targets[i], 0);
		ip_rt_put(rt);
		bond_arp_send(slave, ARPOP_REQUEST, targets[i], addr, tags);
		/* tags is NULL for the bond-itself case; kfree(NULL) is fine */
		kfree(tags);
	}
}

/* Accept an ARP as proof of life for @slave: @tip must be one of the
 * bond's own addresses and @sip must be a configured ARP target. On
 * success the slave's last_rx and the per-target timestamp are updated.
 */
static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32 sip, __be32 tip)
{
	int i;

	if (!sip || !bond_has_this_ip(bond, tip)) {
		slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 tip %pI4 not found\n",
			   __func__, &sip, &tip);
		return;
	}

	i = bond_get_targets_ip(bond->params.arp_targets, sip);
	if (i == -1) {
		slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 not found in targets\n",
			   __func__, &sip);
		return;
	}
	slave->last_rx = jiffies;
	slave->target_last_arp_rx[i] = jiffies;
}

/* Inspect an ARP frame received on @slave and, when it can be trusted,
 * use it to validate the slave (see the rules inside).
 *
 * Always returns RX_HANDLER_ANOTHER.
 */
static int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond,
			struct slave *slave)
{
	struct arphdr *arp = (struct arphdr *)skb->data;
	struct slave *curr_active_slave, *curr_arp_slave;
	unsigned char *arp_ptr;
	__be32 sip, tip;
	unsigned int alen;

	alen = arp_hdr_len(bond->dev);

	/* header not fully in the linear area: work on a private copy,
	 * which is freed again at out_unlock
	 */
	if (alen > skb_headlen(skb)) {
		arp = kmalloc(alen, GFP_ATOMIC);
		if (!arp)
			goto out_unlock;
		if (skb_copy_bits(skb, 0, arp, alen) < 0)
			goto out_unlock;
	}

	/* only IPv4-over-Ethernet ARP addressed to us is of interest */
	if (arp->ar_hln != bond->dev->addr_len ||
	    skb->pkt_type == PACKET_OTHERHOST ||
	    skb->pkt_type == PACKET_LOOPBACK ||
	    arp->ar_hrd != htons(ARPHRD_ETHER) ||
	    arp->ar_pro != htons(ETH_P_IP) ||
	    arp->ar_pln != 4)
		goto out_unlock;

	/* payload layout: sender hw addr, sender IP, target hw addr,
	 * target IP
	 */
	arp_ptr = (unsigned char *)(arp + 1);
	arp_ptr += bond->dev->addr_len;
	memcpy(&sip, arp_ptr, 4);
	arp_ptr += 4 + bond->dev->addr_len;
	memcpy(&tip, arp_ptr, 4);

	slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI4 tip %pI4\n",
		  __func__, slave->dev->name, bond_slave_state(slave),
		  bond->params.arp_validate, slave_do_arp_validate(bond, slave),
		  &sip, &tip);

	curr_active_slave = rcu_dereference(bond->curr_active_slave);
	curr_arp_slave = rcu_dereference(bond->current_arp_slave);

	/* We 'trust' the received ARP enough to validate it if:
	 *
	 * (a) the slave receiving the ARP is active (which includes the
	 * current ARP slave, if any), or
	 *
	 * (b) the receiving slave isn't active, but there is a currently
	 * active slave and it received valid arp reply(s) after it became
	 * the currently active slave, or
	 *
	 * (c) there is an ARP slave that sent an ARP during the prior ARP
	 * interval, and we receive an ARP reply on any slave.  We accept
	 * these because switch FDB update delays may deliver the ARP
	 * reply to a slave other than the sender of the ARP request.
	 *
	 * Note: for (b), backup slaves are receiving the broadcast ARP
	 * request, not a reply.  This request passes from the sending
	 * slave through the L2 switch(es) to the receiving slave.  Since
	 * this is checking the request, sip/tip are swapped for
	 * validation.
	 *
	 * This is done to avoid endless looping when we can't reach the
	 * arp_ip_target and fool ourselves with our own arp requests.
	 */
	if (bond_is_active_slave(slave))
		bond_validate_arp(bond, slave, sip, tip);
	else if (curr_active_slave &&
		 time_after(slave_last_rx(bond, curr_active_slave),
			    curr_active_slave->last_link_up))
		bond_validate_arp(bond, slave, tip, sip);
	else if (curr_arp_slave && (arp->ar_op == htons(ARPOP_REPLY)) &&
		 bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1))
		bond_validate_arp(bond, slave, sip, tip);

out_unlock:
	/* free the private copy, if one was made (NULL is fine as well) */
	if (arp != (struct arphdr *)skb->data)
		kfree(arp);
	return RX_HANDLER_ANOTHER;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Build a neighbour solicitation for @daddr with source @saddr on the
 * slave's device, apply @tags and send it to the solicited-node multicast
 * address derived from @daddr. Allocation failures are logged (rate
 * limited) and the packet is dropped.
 */
static void bond_ns_send(struct slave *slave, const struct in6_addr *daddr,
			 const struct in6_addr *saddr, struct bond_vlan_tag *tags)
{
	struct net_device *bond_dev = slave->bond->dev;
	struct net_device *slave_dev = slave->dev;
	struct in6_addr mcaddr;
	struct sk_buff *skb;

	slave_dbg(bond_dev, slave_dev, "NS on slave: dst %pI6c src %pI6c\n",
		  daddr, saddr);

	skb = ndisc_ns_create(slave_dev, daddr, saddr, 0);
	if (!skb) {
		net_err_ratelimited("NS packet allocation failed\n");
		return;
	}

	addrconf_addr_solict_mult(daddr, &mcaddr);
	if (bond_handle_vlan(slave, tags, skb)) {
		slave_update_last_tx(slave);
		ndisc_send_skb(skb, &mcaddr, saddr);
	}
}

static void
bond_ns_send_all(struct bonding *bond, struct slave *slave) { struct in6_addr *targets = bond->params.ns_targets; struct bond_vlan_tag *tags; struct dst_entry *dst; struct in6_addr saddr; struct flowi6 fl6; int i; for (i = 0; i < BOND_MAX_NS_TARGETS && !ipv6_addr_any(&targets[i]); i++) { slave_dbg(bond->dev, slave->dev, "%s: target %pI6c\n", __func__, &targets[i]); tags = NULL; /* Find out through which dev should the packet go */ memset(&fl6, 0, sizeof(struct flowi6)); fl6.daddr = targets[i]; fl6.flowi6_oif = bond->dev->ifindex; dst = ip6_route_output(dev_net(bond->dev), NULL, &fl6); if (dst->error) { dst_release(dst); /* there's no route to target - try to send arp * probe to generate any traffic (arp_validate=0) */ if (bond->params.arp_validate) pr_warn_once("%s: no route to ns_ip6_target %pI6c and arp_validate is set\n", bond->dev->name, &targets[i]); bond_ns_send(slave, &targets[i], &in6addr_any, tags); continue; } /* bond device itself */ if (dst->dev == bond->dev) goto found; rcu_read_lock(); tags = bond_verify_device_path(bond->dev, dst->dev, 0); rcu_read_unlock(); if (!IS_ERR_OR_NULL(tags)) goto found; /* Not our device - skip */ slave_dbg(bond->dev, slave->dev, "no path to ns_ip6_target %pI6c via dst->dev %s\n", &targets[i], dst->dev ? 
dst->dev->name : "NULL"); dst_release(dst); continue; found: if (!ipv6_dev_get_saddr(dev_net(dst->dev), dst->dev, &targets[i], 0, &saddr)) bond_ns_send(slave, &targets[i], &saddr, tags); else bond_ns_send(slave, &targets[i], &in6addr_any, tags); dst_release(dst); kfree(tags); } } static int bond_confirm_addr6(struct net_device *dev, struct netdev_nested_priv *priv) { struct in6_addr *addr = (struct in6_addr *)priv->data; return ipv6_chk_addr(dev_net(dev), addr, dev, 0); } static bool bond_has_this_ip6(struct bonding *bond, struct in6_addr *addr) { struct netdev_nested_priv priv = { .data = addr, }; int ret = false; if (bond_confirm_addr6(bond->dev, &priv)) return true; rcu_read_lock(); if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_confirm_addr6, &priv)) ret = true; rcu_read_unlock(); return ret; } static void bond_validate_na(struct bonding *bond, struct slave *slave, struct in6_addr *saddr, struct in6_addr *daddr) { int i; /* Ignore NAs that: * 1. Source address is unspecified address. * 2. Dest address is neither all-nodes multicast address nor * exist on bond interface. 
*/ if (ipv6_addr_any(saddr) || (!ipv6_addr_equal(daddr, &in6addr_linklocal_allnodes) && !bond_has_this_ip6(bond, daddr))) { slave_dbg(bond->dev, slave->dev, "%s: sip %pI6c tip %pI6c not found\n", __func__, saddr, daddr); return; } i = bond_get_targets_ip6(bond->params.ns_targets, saddr); if (i == -1) { slave_dbg(bond->dev, slave->dev, "%s: sip %pI6c not found in targets\n", __func__, saddr); return; } slave->last_rx = jiffies; slave->target_last_arp_rx[i] = jiffies; } static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct slave *curr_active_slave, *curr_arp_slave; struct in6_addr *saddr, *daddr; struct { struct ipv6hdr ip6; struct icmp6hdr icmp6; } *combined, _combined; if (skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK) goto out; combined = skb_header_pointer(skb, 0, sizeof(_combined), &_combined); if (!combined || combined->ip6.nexthdr != NEXTHDR_ICMP || (combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION && combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT)) goto out; saddr = &combined->ip6.saddr; daddr = &combined->ip6.daddr; slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI6c tip %pI6c\n", __func__, slave->dev->name, bond_slave_state(slave), bond->params.arp_validate, slave_do_arp_validate(bond, slave), saddr, daddr); curr_active_slave = rcu_dereference(bond->curr_active_slave); curr_arp_slave = rcu_dereference(bond->current_arp_slave); /* We 'trust' the received ARP enough to validate it if: * see bond_arp_rcv(). 
*/ if (bond_is_active_slave(slave)) bond_validate_na(bond, slave, saddr, daddr); else if (curr_active_slave && time_after(slave_last_rx(bond, curr_active_slave), curr_active_slave->last_link_up)) bond_validate_na(bond, slave, daddr, saddr); else if (curr_arp_slave && bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1)) bond_validate_na(bond, slave, saddr, daddr); out: return RX_HANDLER_ANOTHER; } #endif int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { #if IS_ENABLED(CONFIG_IPV6) bool is_ipv6 = skb->protocol == __cpu_to_be16(ETH_P_IPV6); #endif bool is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP); slave_dbg(bond->dev, slave->dev, "%s: skb->dev %s\n", __func__, skb->dev->name); /* Use arp validate logic for both ARP and NS */ if (!slave_do_arp_validate(bond, slave)) { if ((slave_do_arp_validate_only(bond) && is_arp) || #if IS_ENABLED(CONFIG_IPV6) (slave_do_arp_validate_only(bond) && is_ipv6) || #endif !slave_do_arp_validate_only(bond)) slave->last_rx = jiffies; return RX_HANDLER_ANOTHER; } else if (is_arp) { return bond_arp_rcv(skb, bond, slave); #if IS_ENABLED(CONFIG_IPV6) } else if (is_ipv6) { return bond_na_rcv(skb, bond, slave); #endif } else { return RX_HANDLER_ANOTHER; } } static void bond_send_validate(struct bonding *bond, struct slave *slave) { bond_arp_send_all(bond, slave); #if IS_ENABLED(CONFIG_IPV6) bond_ns_send_all(bond, slave); #endif } /* function to verify if we're in the arp_interval timeslice, returns true if * (last_act - arp_interval) <= jiffies <= (last_act + mod * arp_interval + * arp_interval/2) . the arp_interval/2 is needed for really fast networks. 
*/
static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
				  int mod)
{
	/* arp_interval is configured in milliseconds; the window is
	 * computed in jiffies
	 */
	int delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);

	return time_in_range(jiffies,
			     last_act - delta_in_ticks,
			     last_act + mod * delta_in_ticks + delta_in_ticks/2);
}

/* This function is called regularly to monitor each slave's link
 * ensuring that traffic is being sent and received when arp monitoring
 * is used in load-balancing mode. if the adapter has been dormant, then an
 * arp is transmitted to generate traffic. see activebackup_arp_monitor for
 * arp monitoring in active backup mode.
 *
 * The function works in two phases: under rcu_read_lock() it only proposes
 * new link states and sends probes; the proposals are committed afterwards
 * under RTNL, which is taken with rtnl_trylock(). If the trylock fails the
 * commit is skipped for this run and the work is simply re-armed.
 */
static void bond_loadbalance_arp_mon(struct bonding *bond)
{
	struct slave *slave, *oldcurrent;
	struct list_head *iter;
	int do_failover = 0, slave_state_changed = 0;

	if (!bond_has_slaves(bond))
		goto re_arm;

	rcu_read_lock();

	oldcurrent = rcu_dereference(bond->curr_active_slave);
	/* see if any of the previous devices are up now (i.e. they have
	 * xmt and rcv traffic). the curr_active_slave does not come into
	 * the picture unless it is null. also, slave->last_link_up is not
	 * needed here because we send an arp on each slave and give a slave
	 * as long as it needs to get the tx/rx within the delta.
	 * TODO: what about up/down delay in arp mode? it wasn't here before
	 *       so it can wait
	 */
	bond_for_each_slave_rcu(bond, slave, iter) {
		unsigned long last_tx = slave_last_tx(slave);

		/* start every run from "no change" */
		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);

		if (slave->link != BOND_LINK_UP) {
			/* both tx and rx must have happened within the
			 * last interval for the link to be proposed up
			 */
			if (bond_time_in_interval(bond, last_tx, 1) &&
			    bond_time_in_interval(bond, slave->last_rx, 1)) {

				bond_propose_link_state(slave, BOND_LINK_UP);
				slave_state_changed = 1;

				/* primary_slave has no meaning in round-robin
				 * mode. the window of a slave being up and
				 * curr_active_slave being null after enslaving
				 * is closed.
				 */
				if (!oldcurrent) {
					slave_info(bond->dev, slave->dev, "link status definitely up\n");
					do_failover = 1;
				} else {
					slave_info(bond->dev, slave->dev, "interface is now up\n");
				}
			}
		} else {
			/* slave->link == BOND_LINK_UP */

			/* not all switches will respond to an arp request
			 * when the source ip is 0, so don't take the link down
			 * if we don't know our ip yet
			 */
			if (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) ||
			    !bond_time_in_interval(bond, slave->last_rx, bond->params.missed_max)) {

				bond_propose_link_state(slave, BOND_LINK_DOWN);
				slave_state_changed = 1;

				/* saturating counter */
				if (slave->link_failure_count < UINT_MAX)
					slave->link_failure_count++;

				slave_info(bond->dev, slave->dev, "interface is now down\n");

				if (slave == oldcurrent)
					do_failover = 1;
			}
		}

		/* note: if switch is in round-robin mode, all links
		 * must tx arp to ensure all links rx an arp - otherwise
		 * links may oscillate or not come up at all; if switch is
		 * in something like xor mode, there is nothing we can
		 * do - all replies will be rx'ed on same link causing slaves
		 * to be unstable during low/no traffic periods
		 */
		if (bond_slave_is_up(slave))
			bond_send_validate(bond, slave);
	}

	rcu_read_unlock();

	if (do_failover || slave_state_changed) {
		if (!rtnl_trylock())
			goto re_arm;

		/* commit the states proposed in the RCU section above */
		bond_for_each_slave(bond, slave, iter) {
			if (slave->link_new_state != BOND_LINK_NOCHANGE)
				slave->link = slave->link_new_state;
		}

		if (slave_state_changed) {
			bond_slave_state_change(bond);
			if (BOND_MODE(bond) == BOND_MODE_XOR)
				bond_update_slave_arr(bond, NULL);
		}
		if (do_failover) {
			block_netpoll_tx();
			bond_select_active_slave(bond);
			unblock_netpoll_tx();
		}
		rtnl_unlock();
	}

re_arm:
	/* re-queue ourselves only while ARP monitoring is still enabled */
	if (bond->params.arp_interval)
		queue_delayed_work(bond->wq, &bond->arp_work,
				   msecs_to_jiffies(bond->params.arp_interval));
}

/* Called to inspect slaves for active-backup mode ARP monitor link state
 * changes.  Sets proposed link state in slaves to specify what action
 * should take place for the slave.
Returns 0 if no changes are found, >0
 * if changes to link states must be committed.
 *
 * Called with rcu_read_lock held.
 */
static int bond_ab_arp_inspect(struct bonding *bond)
{
	unsigned long last_tx, last_rx;
	struct list_head *iter;
	struct slave *slave;
	int commit = 0;	/* number of proposals made in this pass */

	bond_for_each_slave_rcu(bond, slave, iter) {
		/* start every pass from "no change" */
		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
		last_rx = slave_last_rx(bond, slave);

		if (slave->link != BOND_LINK_UP) {
			/* A slave that is not up is proposed UP as soon as
			 * it received something within the last interval;
			 * a slave still in BOND_LINK_BACK that did not is
			 * proposed FAIL.
			 */
			if (bond_time_in_interval(bond, last_rx, 1)) {
				bond_propose_link_state(slave, BOND_LINK_UP);
				commit++;
			} else if (slave->link == BOND_LINK_BACK) {
				bond_propose_link_state(slave, BOND_LINK_FAIL);
				commit++;
			}
			continue;
		}

		/* Give slaves 2*delta after being enslaved or made
		 * active.  This avoids bouncing, as the last receive
		 * times need a full ARP monitor cycle to be updated.
		 */
		if (bond_time_in_interval(bond, slave->last_link_up, 2))
			continue;

		/* Backup slave is down if:
		 * - No current_arp_slave AND
		 * - more than (missed_max+1)*delta since last receive AND
		 * - the bond has an IP address
		 *
		 * Note: a non-null current_arp_slave indicates
		 * the curr_active_slave went down and we are
		 * searching for a new one; under this condition
		 * we only take the curr_active_slave down - this
		 * gives each slave a chance to tx/rx traffic
		 * before being taken out
		 *
		 * NOTE(review): the "bond has an IP address" condition listed
		 * above has no corresponding check in the test below.
		 */
		if (!bond_is_active_slave(slave) &&
		    !rcu_access_pointer(bond->current_arp_slave) &&
		    !bond_time_in_interval(bond, last_rx, bond->params.missed_max + 1)) {
			bond_propose_link_state(slave, BOND_LINK_DOWN);
			commit++;
		}

		/* Active slave is down if:
		 * - more than missed_max*delta since transmitting OR
		 * - (more than missed_max*delta since receive AND
		 *    the bond has an IP address)
		 *
		 * NOTE(review): as above, no IP-address check is visible in
		 * the condition below.
		 */
		last_tx = slave_last_tx(slave);
		if (bond_is_active_slave(slave) &&
		    (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) ||
		     !bond_time_in_interval(bond, last_rx, bond->params.missed_max))) {
			bond_propose_link_state(slave, BOND_LINK_DOWN);
			commit++;
		}
	}

	return commit;
}

/* Called to commit link state changes
noted by inspection step of * active-backup mode ARP monitor. * * Called with RTNL hold. */ static void bond_ab_arp_commit(struct bonding *bond) { bool do_failover = false; struct list_head *iter; unsigned long last_tx; struct slave *slave; bond_for_each_slave(bond, slave, iter) { switch (slave->link_new_state) { case BOND_LINK_NOCHANGE: continue; case BOND_LINK_UP: last_tx = slave_last_tx(slave); if (rtnl_dereference(bond->curr_active_slave) != slave || (!rtnl_dereference(bond->curr_active_slave) && bond_time_in_interval(bond, last_tx, 1))) { struct slave *current_arp_slave; current_arp_slave = rtnl_dereference(bond->current_arp_slave); bond_set_slave_link_state(slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); if (current_arp_slave) { bond_set_slave_inactive_flags( current_arp_slave, BOND_SLAVE_NOTIFY_NOW); RCU_INIT_POINTER(bond->current_arp_slave, NULL); } slave_info(bond->dev, slave->dev, "link status definitely up\n"); if (!rtnl_dereference(bond->curr_active_slave) || slave == rtnl_dereference(bond->primary_slave) || slave->prio > rtnl_dereference(bond->curr_active_slave)->prio) do_failover = true; } continue; case BOND_LINK_DOWN: if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; bond_set_slave_link_state(slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_NOW); bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n"); if (slave == rtnl_dereference(bond->curr_active_slave)) { RCU_INIT_POINTER(bond->current_arp_slave, NULL); do_failover = true; } continue; case BOND_LINK_FAIL: bond_set_slave_link_state(slave, BOND_LINK_FAIL, BOND_SLAVE_NOTIFY_NOW); bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); /* A slave has just been enslaved and has become * the current active slave. 
*/ if (rtnl_dereference(bond->curr_active_slave)) RCU_INIT_POINTER(bond->current_arp_slave, NULL); continue; default: slave_err(bond->dev, slave->dev, "impossible: link_new_state %d on slave\n", slave->link_new_state); continue; } } if (do_failover) { block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); } bond_set_carrier(bond); } /* Send ARP probes for active-backup mode ARP monitor. * * Called with rcu_read_lock held. */ static bool bond_ab_arp_probe(struct bonding *bond) { struct slave *slave, *before = NULL, *new_slave = NULL, *curr_arp_slave = rcu_dereference(bond->current_arp_slave), *curr_active_slave = rcu_dereference(bond->curr_active_slave); struct list_head *iter; bool found = false; bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER; if (curr_arp_slave && curr_active_slave) netdev_info(bond->dev, "PROBE: c_arp %s && cas %s BAD\n", curr_arp_slave->dev->name, curr_active_slave->dev->name); if (curr_active_slave) { bond_send_validate(bond, curr_active_slave); return should_notify_rtnl; } /* if we don't have a curr_active_slave, search for the next available * backup slave from the current_arp_slave and make it the candidate * for becoming the curr_active_slave */ if (!curr_arp_slave) { curr_arp_slave = bond_first_slave_rcu(bond); if (!curr_arp_slave) return should_notify_rtnl; } bond_for_each_slave_rcu(bond, slave, iter) { if (!found && !before && bond_slave_is_up(slave)) before = slave; if (found && !new_slave && bond_slave_is_up(slave)) new_slave = slave; /* if the link state is up at this point, we * mark it down - this can happen if we have * simultaneous link failures and * reselect_active_interface doesn't make this * one the current slave so it is still marked * up when it is actually down */ if (!bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) { bond_set_slave_link_state(slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_LATER); if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; 
bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_LATER); slave_info(bond->dev, slave->dev, "backup interface is now down\n"); } if (slave == curr_arp_slave) found = true; } if (!new_slave && before) new_slave = before; if (!new_slave) goto check_state; bond_set_slave_link_state(new_slave, BOND_LINK_BACK, BOND_SLAVE_NOTIFY_LATER); bond_set_slave_active_flags(new_slave, BOND_SLAVE_NOTIFY_LATER); bond_send_validate(bond, new_slave); new_slave->last_link_up = jiffies; rcu_assign_pointer(bond->current_arp_slave, new_slave); check_state: bond_for_each_slave_rcu(bond, slave, iter) { if (slave->should_notify || slave->should_notify_link) { should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW; break; } } return should_notify_rtnl; } static void bond_activebackup_arp_mon(struct bonding *bond) { bool should_notify_peers = false; bool should_notify_rtnl = false; int delta_in_ticks; delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval); if (!bond_has_slaves(bond)) goto re_arm; rcu_read_lock(); should_notify_peers = bond_should_notify_peers(bond); if (bond_ab_arp_inspect(bond)) { rcu_read_unlock(); /* Race avoidance with bond_close flush of workqueue */ if (!rtnl_trylock()) { delta_in_ticks = 1; should_notify_peers = false; goto re_arm; } bond_ab_arp_commit(bond); rtnl_unlock(); rcu_read_lock(); } should_notify_rtnl = bond_ab_arp_probe(bond); rcu_read_unlock(); re_arm: if (bond->params.arp_interval) queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks); if (should_notify_peers || should_notify_rtnl) { if (!rtnl_trylock()) return; if (should_notify_peers) { bond->send_peer_notif--; call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); } if (should_notify_rtnl) { bond_slave_state_notify(bond); bond_slave_link_notify(bond); } rtnl_unlock(); } } static void bond_arp_monitor(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, arp_work.work); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) bond_activebackup_arp_mon(bond); else 
bond_loadbalance_arp_mon(bond); } /*-------------------------- netdev event handling --------------------------*/ /* Change device name */ static int bond_event_changename(struct bonding *bond) { bond_remove_proc_entry(bond); bond_create_proc_entry(bond); bond_debug_reregister(bond); return NOTIFY_DONE; } static int bond_master_netdev_event(unsigned long event, struct net_device *bond_dev) { struct bonding *event_bond = netdev_priv(bond_dev); netdev_dbg(bond_dev, "%s called\n", __func__); switch (event) { case NETDEV_CHANGENAME: return bond_event_changename(event_bond); case NETDEV_UNREGISTER: bond_remove_proc_entry(event_bond); #ifdef CONFIG_XFRM_OFFLOAD xfrm_dev_state_flush(dev_net(bond_dev), bond_dev, true); #endif /* CONFIG_XFRM_OFFLOAD */ break; case NETDEV_REGISTER: bond_create_proc_entry(event_bond); break; default: break; } return NOTIFY_DONE; } static int bond_slave_netdev_event(unsigned long event, struct net_device *slave_dev) { struct slave *slave = bond_slave_get_rtnl(slave_dev), *primary; struct bonding *bond; struct net_device *bond_dev; /* A netdev event can be generated while enslaving a device * before netdev_rx_handler_register is called in which case * slave will be NULL */ if (!slave) { netdev_dbg(slave_dev, "%s called on NULL slave\n", __func__); return NOTIFY_DONE; } bond_dev = slave->bond->dev; bond = slave->bond; primary = rtnl_dereference(bond->primary_slave); slave_dbg(bond_dev, slave_dev, "%s called\n", __func__); switch (event) { case NETDEV_UNREGISTER: if (bond_dev->type != ARPHRD_ETHER) bond_release_and_destroy(bond_dev, slave_dev); else __bond_release_one(bond_dev, slave_dev, false, true); break; case NETDEV_UP: case NETDEV_CHANGE: /* For 802.3ad mode only: * Getting invalid Speed/Duplex values here will put slave * in weird state. Mark it as link-fail if the link was * previously up or link-down if it hasn't yet come up, and * let link-monitoring (miimon) set it right when correct * speeds/duplex are available. 
*/ if (bond_update_speed_duplex(slave) && BOND_MODE(bond) == BOND_MODE_8023AD) { if (slave->last_link_up) slave->link = BOND_LINK_FAIL; else slave->link = BOND_LINK_DOWN; } if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_adapter_speed_duplex_changed(slave); fallthrough; case NETDEV_DOWN: /* Refresh slave-array if applicable! * If the setup does not use miimon or arpmon (mode-specific!), * then these events will not cause the slave-array to be * refreshed. This will cause xmit to use a slave that is not * usable. Avoid such situation by refeshing the array at these * events. If these (miimon/arpmon) parameters are configured * then array gets refreshed twice and that should be fine! */ if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, NULL); break; case NETDEV_CHANGEMTU: /* TODO: Should slaves be allowed to * independently alter their MTU? For * an active-backup bond, slaves need * not be the same type of device, so * MTUs may vary. For other modes, * slaves arguably should have the * same MTUs. To do this, we'd need to * take over the slave's change_mtu * function for the duration of their * servitude. */ break; case NETDEV_CHANGENAME: /* we don't care if we don't have primary set */ if (!bond_uses_primary(bond) || !bond->params.primary[0]) break; if (slave == primary) { /* slave's name changed - he's no longer primary */ RCU_INIT_POINTER(bond->primary_slave, NULL); } else if (!strcmp(slave_dev->name, bond->params.primary)) { /* we have a new primary slave */ rcu_assign_pointer(bond->primary_slave, slave); } else { /* we didn't change primary - exit */ break; } netdev_info(bond->dev, "Primary slave changed to %s, reselecting active slave\n", primary ? 
slave_dev->name : "none"); block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); break; case NETDEV_FEAT_CHANGE: if (!bond->notifier_ctx) { bond->notifier_ctx = true; bond_compute_features(bond); bond->notifier_ctx = false; } break; case NETDEV_RESEND_IGMP: /* Propagate to master device */ call_netdevice_notifiers(event, slave->bond->dev); break; case NETDEV_XDP_FEAT_CHANGE: bond_xdp_set_features(bond_dev); break; default: break; } return NOTIFY_DONE; } /* bond_netdev_event: handle netdev notifier chain events. * * This function receives events for the netdev chain. The caller (an * ioctl handler calling blocking_notifier_call_chain) holds the necessary * locks for us to safely manipulate the slave devices (RTNL lock, * dev_probe_lock). */ static int bond_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); netdev_dbg(event_dev, "%s received %s\n", __func__, netdev_cmd_to_name(event)); if (!(event_dev->priv_flags & IFF_BONDING)) return NOTIFY_DONE; if (event_dev->flags & IFF_MASTER) { int ret; ret = bond_master_netdev_event(event, event_dev); if (ret != NOTIFY_DONE) return ret; } if (event_dev->flags & IFF_SLAVE) return bond_slave_netdev_event(event, event_dev); return NOTIFY_DONE; } static struct notifier_block bond_netdev_notifier = { .notifier_call = bond_netdev_event, }; /*---------------------------- Hashing Policies -----------------------------*/ /* Helper to access data in a packet, with or without a backing skb. * If skb is given the data is linearized if necessary via pskb_may_pull. 
*/
static inline const void *bond_pull_data(struct sk_buff *skb,
					 const void *data, int hlen, int n)
{
	/* Returns a pointer through which the first @n bytes can be read:
	 * @data itself when @hlen already covers them, skb->data after a
	 * successful pull when an skb is available, NULL otherwise.
	 */
	if (likely(n <= hlen))
		return data;
	else if (skb && likely(pskb_may_pull(skb, n)))
		return skb->data;

	return NULL;
}

/* L2 hash helper
 *
 * XORs the last byte of the destination MAC, the last byte of the source
 * MAC and the EtherType (converted to CPU byte order) of the Ethernet header
 * found at offset @mhoff. Returns 0 when the header cannot be accessed.
 */
static inline u32 bond_eth_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen)
{
	struct ethhdr *ep;

	data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr));
	if (!data)
		return 0;

	ep = (struct ethhdr *)(data + mhoff);
	return ep->h_dest[5] ^ ep->h_source[5] ^ be16_to_cpu(ep->h_proto);
}

/* Parse the IPv4 or IPv6 header at offset *@nhoff into @fk.
 *
 * On success the addresses are copied into @fk, *@nhoff is advanced past the
 * IP header and *@ip_proto is set to the next protocol (for IPv4 only when
 * the packet is not a fragment). When @l34 is set and *@ip_proto is
 * non-negative, the ports are extracted as well.
 *
 * Returns false when @l2_proto is neither IPv4 nor IPv6 or when the header
 * cannot be accessed.
 */
static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk, const void *data,
			 int hlen, __be16 l2_proto, int *nhoff, int *ip_proto, bool l34)
{
	const struct ipv6hdr *iph6;
	const struct iphdr *iph;

	if (l2_proto == htons(ETH_P_IP)) {
		/* data is re-assigned: bond_pull_data() may hand back
		 * skb->data instead of the pointer passed in
		 */
		data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph));
		if (!data)
			return false;

		iph = (const struct iphdr *)(data + *nhoff);
		iph_to_flow_copy_v4addrs(fk, iph);
		/* ihl counts 32-bit words */
		*nhoff += iph->ihl << 2;
		if (!ip_is_fragment(iph))
			*ip_proto = iph->protocol;
	} else if (l2_proto == htons(ETH_P_IPV6)) {
		data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph6));
		if (!data)
			return false;

		iph6 = (const struct ipv6hdr *)(data + *nhoff);
		iph_to_flow_copy_v6addrs(fk, iph6);
		*nhoff += sizeof(*iph6);
		*ip_proto = iph6->nexthdr;
	} else {
		return false;
	}

	if (l34 && *ip_proto >= 0)
		fk->ports.ports = __skb_flow_get_ports(skb, *nhoff, *ip_proto, data, hlen);

	return true;
}

/* Hash for the vlan+srcmac policy: XOR of the VLAN tag (0 when there is no
 * skb or no tag), the first three source-MAC bytes and the remaining
 * source-MAC bytes. Returns 0 when the Ethernet header cannot be accessed.
 */
static u32 bond_vlan_srcmac_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen)
{
	u32 srcmac_vendor = 0, srcmac_dev = 0;
	struct ethhdr *mac_hdr;
	u16 vlan = 0;
	int i;

	data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr));
	if (!data)
		return 0;
	mac_hdr = (struct ethhdr *)(data + mhoff);

	/* bytes 0..2 of the source MAC */
	for (i = 0; i < 3; i++)
		srcmac_vendor = (srcmac_vendor << 8) | mac_hdr->h_source[i];

	/* bytes 3..ETH_ALEN-1 of the source MAC */
	for (i = 3; i < ETH_ALEN; i++)
		srcmac_dev = (srcmac_dev << 8) | mac_hdr->h_source[i];

	if (skb && skb_vlan_tag_present(skb))
		vlan = skb_vlan_tag_get(skb);

	return vlan ^ srcmac_vendor ^ srcmac_dev;
}

/* Extract the appropriate headers based on bond's xmit policy
 *
 * The encap2+3 / encap3+4 policies delegate to __skb_flow_dissect() with the
 * flow_keys_bonding dissector; all other policies parse the IP header (and,
 * for ICMP errors, the embedded IP header) through bond_flow_ip().
 * Returns false when no flow keys could be extracted.
 */
static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, const void *data,
			      __be16 l2_proto, int nhoff, int hlen, struct flow_keys *fk)
{
	bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
	int ip_proto = -1;	/* -1: no L4 protocol determined yet */

	switch (bond->params.xmit_policy) {
	case BOND_XMIT_POLICY_ENCAP23:
	case BOND_XMIT_POLICY_ENCAP34:
		memset(fk, 0, sizeof(*fk));
		return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
					  fk, data, l2_proto, nhoff, hlen, 0);
	default:
		break;
	}

	/* only these two members are cleared here; the address fields are
	 * written by bond_flow_ip() on success
	 */
	fk->ports.ports = 0;
	memset(&fk->icmp, 0, sizeof(fk->icmp));
	if (!bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34))
		return false;

	/* ICMP error packets contains at least 8 bytes of the header
	 * of the packet which generated the error. Use this information
	 * to correlate ICMP error packets within the same flow which
	 * generated the error.
	 */
	if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) {
		skb_flow_get_icmp_tci(skb, &fk->icmp, data, nhoff, hlen);
		if (ip_proto == IPPROTO_ICMP) {
			if (!icmp_is_err(fk->icmp.type))
				return true;

			nhoff += sizeof(struct icmphdr);
		} else if (ip_proto == IPPROTO_ICMPV6) {
			if (!icmpv6_is_err(fk->icmp.type))
				return true;

			nhoff += sizeof(struct icmp6hdr);
		}
		/* dissect the IP header embedded in the ICMP error */
		return bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34);
	}

	return true;
}

/* Mix the flow's source and destination addresses into @hash and fold the
 * upper bits down. For the layer3+4 and encap3+4 policies the lowest bit of
 * the result is dropped.
 */
static u32 bond_ip_hash(u32 hash, struct flow_keys *flow, int xmit_policy)
{
	hash ^= (__force u32)flow_get_u32_dst(flow) ^
		(__force u32)flow_get_u32_src(flow);
	hash ^= (hash >> 16);
	hash ^= (hash >> 8);

	/* discard lowest hash bit to deal with the common even ports pattern */
	if (xmit_policy == BOND_XMIT_POLICY_LAYER34 ||
		xmit_policy == BOND_XMIT_POLICY_ENCAP34)
		return hash >> 1;

	return hash;
}

/* Generate hash based on xmit policy.
If @skb is given it is used to linearize * the data as required, but this function can be used without it if the data is * known to be linear (e.g. with xdp_buff). */ static u32 __bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, const void *data, __be16 l2_proto, int mhoff, int nhoff, int hlen) { struct flow_keys flow; u32 hash; if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC) return bond_vlan_srcmac_hash(skb, data, mhoff, hlen); if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 || !bond_flow_dissect(bond, skb, data, l2_proto, nhoff, hlen, &flow)) return bond_eth_hash(skb, data, mhoff, hlen); if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 || bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) { hash = bond_eth_hash(skb, data, mhoff, hlen); } else { if (flow.icmp.id) memcpy(&hash, &flow.icmp, sizeof(hash)); else memcpy(&hash, &flow.ports.ports, sizeof(hash)); } return bond_ip_hash(hash, &flow, bond->params.xmit_policy); } /** * bond_xmit_hash - generate a hash value based on the xmit policy * @bond: bonding device * @skb: buffer to use for headers * * This function will extract the necessary headers from the skb buffer and use * them to generate a hash based on the xmit_policy set in the bonding device */ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb) { if (bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP34 && skb->l4_hash) return skb->hash; return __bond_xmit_hash(bond, skb, skb->data, skb->protocol, 0, skb_network_offset(skb), skb_headlen(skb)); } /** * bond_xmit_hash_xdp - generate a hash value based on the xmit policy * @bond: bonding device * @xdp: buffer to use for headers * * The XDP variant of bond_xmit_hash. 
*/
static u32 bond_xmit_hash_xdp(struct bonding *bond, struct xdp_buff *xdp)
{
	struct ethhdr *eth;

	/* frame too short to hold an Ethernet header: hash to 0 */
	if (xdp->data + sizeof(struct ethhdr) > xdp->data_end)
		return 0;

	eth = (struct ethhdr *)xdp->data;

	/* no skb is passed, so only the bytes between data and data_end
	 * are available to the hash helpers
	 */
	return __bond_xmit_hash(bond, NULL, xdp->data, eth->h_proto, 0,
				sizeof(struct ethhdr), xdp->data_end - xdp->data);
}

/*-------------------------- Device entry points ----------------------------*/

/* Bind every delayed work item of @bond to its handler. */
void bond_work_init_all(struct bonding *bond)
{
	INIT_DELAYED_WORK(&bond->mcast_work,
			  bond_resend_igmp_join_requests_delayed);
	INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor);
	INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor);
	INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor);
	INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);
	INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler);
}

/* Synchronously cancel every delayed work item of @bond. */
static void bond_work_cancel_all(struct bonding *bond)
{
	cancel_delayed_work_sync(&bond->mii_work);
	cancel_delayed_work_sync(&bond->arp_work);
	cancel_delayed_work_sync(&bond->alb_work);
	cancel_delayed_work_sync(&bond->ad_work);
	cancel_delayed_work_sync(&bond->mcast_work);
	cancel_delayed_work_sync(&bond->slave_arr_work);
}

/* Open the bond device: reset the slaves' active/inactive flags and start
 * the monitors that the current mode and parameters require.
 *
 * Returns 0 on success or -ENOMEM when the round-robin counter or the
 * ALB/TLB state cannot be allocated.
 */
static int bond_open(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct list_head *iter;
	struct slave *slave;

	/* allocated once; an existing counter is reused */
	if (BOND_MODE(bond) == BOND_MODE_ROUNDROBIN && !bond->rr_tx_counter) {
		bond->rr_tx_counter = alloc_percpu(u32);
		if (!bond->rr_tx_counter)
			return -ENOMEM;
	}

	/* reset slave->backup and slave->inactive */
	if (bond_has_slaves(bond)) {
		bond_for_each_slave(bond, slave, iter) {
			if (bond_uses_primary(bond) &&
			    slave != rcu_access_pointer(bond->curr_active_slave)) {
				bond_set_slave_inactive_flags(slave,
							      BOND_SLAVE_NOTIFY_NOW);
			} else if (BOND_MODE(bond) != BOND_MODE_8023AD) {
				bond_set_slave_active_flags(slave,
							    BOND_SLAVE_NOTIFY_NOW);
			}
		}
	}

	if (bond_is_lb(bond)) {
		/* bond_alb_initialize must be called before the timer
		 * is started.
		 */
		if (bond_alb_initialize(bond, (BOND_MODE(bond) == BOND_MODE_ALB)))
			return -ENOMEM;
		if (bond->params.tlb_dynamic_lb || BOND_MODE(bond) == BOND_MODE_ALB)
			queue_delayed_work(bond->wq, &bond->alb_work, 0);
	}

	if (bond->params.miimon)  /* link check interval, in milliseconds. */
		queue_delayed_work(bond->wq, &bond->mii_work, 0);

	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
		queue_delayed_work(bond->wq, &bond->arp_work, 0);
		bond->recv_probe = bond_rcv_validate;
	}

	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
		queue_delayed_work(bond->wq, &bond->ad_work, 0);
		/* register to receive LACPDUs */
		/* note: this replaces any recv_probe set for ARP above */
		bond->recv_probe = bond_3ad_lacpdu_recv;
		bond_3ad_initiate_agg_selection(bond, 1);

		bond_for_each_slave(bond, slave, iter)
			dev_mc_add(slave->dev, lacpdu_mcast_addr);
	}

	if (bond_mode_can_use_xmit_hash(bond))
		bond_update_slave_arr(bond, NULL);

	return 0;
}

/* Close the bond device: stop all delayed work, drop the receive probe and
 * flush the bond's hardware addresses from the slave(s) that carry them.
 *
 * Always returns 0.
 */
static int bond_close(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave;

	bond_work_cancel_all(bond);
	bond->send_peer_notif = 0;
	if (bond_is_lb(bond))
		bond_alb_deinitialize(bond);
	bond->recv_probe = NULL;

	if (bond_uses_primary(bond)) {
		/* only the current active slave is flushed in these modes */
		rcu_read_lock();
		slave = rcu_dereference(bond->curr_active_slave);
		if (slave)
			bond_hw_addr_flush(bond_dev, slave->dev);
		rcu_read_unlock();
	} else {
		struct list_head *iter;

		bond_for_each_slave(bond, slave, iter)
			bond_hw_addr_flush(bond_dev, slave->dev);
	}

	return 0;
}

/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
 * that some drivers can provide 32bit values only.
*/
static void bond_fold_stats(struct rtnl_link_stats64 *_res,
			    const struct rtnl_link_stats64 *_new,
			    const struct rtnl_link_stats64 *_old)
{
	/* All three structures are walked as flat arrays of u64 counters. */
	const u64 *new = (const u64 *)_new;
	const u64 *old = (const u64 *)_old;
	u64 *res = (u64 *)_res;
	int i;

	for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
		u64 nv = new[i];
		u64 ov = old[i];
		s64 delta = nv - ov;

		/* detects if this particular field is 32bit only */
		if (((nv | ov) >> 32) == 0)
			delta = (s64)(s32)((u32)nv - (u32)ov);

		/* filter anomalies, some drivers reset their stats
		 * at down/up events.
		 */
		if (delta > 0)
			res[i] += delta;
	}
}

#ifdef CONFIG_LOCKDEP
/* Walk the lower-device adjacency lists below @dev iteratively, using
 * dev_stack[]/iter_stack[] as an explicit stack, and return the largest
 * stack depth reached. bond_get_stats() uses the result as the subclass
 * passed to spin_lock_nested().
 */
static int bond_get_lowest_level_rcu(struct net_device *dev)
{
	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
	int cur = 0, max = 0;

	now = dev;
	iter = &dev->adj_list.lower;

	while (1) {
		next = NULL;
		while (1) {
			ldev = netdev_next_lower_dev_rcu(now, &iter);
			if (!ldev)
				break;

			/* Descend: remember where to resume on this level. */
			next = ldev;
			niter = &ldev->adj_list.lower;
			dev_stack[cur] = now;
			iter_stack[cur++] = iter;
			if (max <= cur)
				max = cur;
			break;
		}

		if (!next) {
			/* Level exhausted: pop, or finish when the stack is empty. */
			if (!cur)
				return max;
			next = dev_stack[--cur];
			niter = iter_stack[cur];
		}

		now = next;
		iter = niter;
	}

	return max;
}
#endif

/* ndo_get_stats64 handler: start from the bond's accumulated counters, fold
 * in each slave's change since the previous call, and store both the new
 * totals and the per-slave snapshots for the next call. The update runs
 * under bond->stats_lock inside an RCU read-side section.
 */
static void bond_get_stats(struct net_device *bond_dev,
			   struct rtnl_link_stats64 *stats)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct rtnl_link_stats64 temp;
	struct list_head *iter;
	struct slave *slave;
	int nest_level = 0;


	rcu_read_lock();
#ifdef CONFIG_LOCKDEP
	nest_level = bond_get_lowest_level_rcu(bond_dev);
#endif

	spin_lock_nested(&bond->stats_lock, nest_level);
	memcpy(stats, &bond->bond_stats, sizeof(*stats));

	bond_for_each_slave_rcu(bond, slave, iter) {
		const struct rtnl_link_stats64 *new =
			dev_get_stats(slave->dev, &temp);

		bond_fold_stats(stats, new, &slave->slave_stats);

		/* save off the slave stats for the next run */
		memcpy(&slave->slave_stats, new, sizeof(*new));
	}

	memcpy(&bond->bond_stats, stats, sizeof(*stats));
	spin_unlock(&bond->stats_lock);
	rcu_read_unlock();
}

/* ndo_eth_ioctl handler for the MII ioctls. SIOCGMIIPHY reports phy_id 0 and
 * then falls through to SIOCGMIIREG, where a read of register 1 yields
 * BMSR_LSTATUS when the bond has carrier and 0 otherwise.
 *
 * Returns 0 on success, -EINVAL when if_mii() yields NULL and -EOPNOTSUPP
 * for any other command.
 */
static int bond_eth_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct mii_ioctl_data *mii = NULL;

	netdev_dbg(bond_dev, "bond_eth_ioctl: cmd=%d\n", cmd);

	switch (cmd) {
	case SIOCGMIIPHY:
		mii = if_mii(ifr);
		if (!mii)
			return -EINVAL;

		mii->phy_id = 0;
		fallthrough;
	case SIOCGMIIREG:
		/* We do this again just in case we were called by SIOCGMIIREG
		 * instead of SIOCGMIIPHY.
		 */
		mii = if_mii(ifr);
		if (!mii)
			return -EINVAL;

		if (mii->reg_num == 1) {
			mii->val_out = 0;
			if (netif_carrier_ok(bond->dev))
				mii->val_out = BMSR_LSTATUS;
		}

		break;
	default:
		return -EOPNOTSUPP;
	}

	return 0;
}

/* ndo_siocbond handler. The two query commands are answered first and need
 * no capability. Every other command requires CAP_NET_ADMIN in the device's
 * user namespace and names a slave device through ifr->ifr_slave.
 *
 * Returns 0 or the result of the requested operation, -EFAULT on a failed
 * user copy, -EPERM without the capability, -ENODEV when the named device
 * does not exist and -EOPNOTSUPP for unknown commands.
 */
static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct net_device *slave_dev = NULL;
	struct ifbond k_binfo;
	struct ifbond __user *u_binfo = NULL;
	struct ifslave k_sinfo;
	struct ifslave __user *u_sinfo = NULL;
	struct bond_opt_value newval;
	struct net *net;
	int res = 0;

	netdev_dbg(bond_dev, "bond_ioctl: cmd=%d\n", cmd);

	switch (cmd) {
	case SIOCBONDINFOQUERY:
		u_binfo = (struct ifbond __user *)ifr->ifr_data;

		if (copy_from_user(&k_binfo, u_binfo, sizeof(ifbond)))
			return -EFAULT;

		bond_info_query(bond_dev, &k_binfo);
		if (copy_to_user(u_binfo, &k_binfo, sizeof(ifbond)))
			return -EFAULT;

		return 0;
	case SIOCBONDSLAVEINFOQUERY:
		u_sinfo = (struct ifslave __user *)ifr->ifr_data;

		if (copy_from_user(&k_sinfo, u_sinfo, sizeof(ifslave)))
			return -EFAULT;

		res = bond_slave_info_query(bond_dev, &k_sinfo);
		if (res == 0 &&
		    copy_to_user(u_sinfo, &k_sinfo, sizeof(ifslave)))
			return -EFAULT;

		return res;
	default:
		break;
	}

	net = dev_net(bond_dev);

	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	slave_dev = __dev_get_by_name(net, ifr->ifr_slave);

	slave_dbg(bond_dev, slave_dev, "slave_dev=%p:\n", slave_dev);

	if (!slave_dev)
		return -ENODEV;

	switch (cmd) {
	case SIOCBONDENSLAVE:
		res = bond_enslave(bond_dev, slave_dev, NULL);
		break;
	case SIOCBONDRELEASE:
		res = bond_release(bond_dev, slave_dev);
		break;
	case SIOCBONDSETHWADDR:
		res = bond_set_dev_addr(bond_dev, slave_dev);
		break;
	case SIOCBONDCHANGEACTIVE:
		/* Routed through the option API, using the slave's name. */
		bond_opt_initstr(&newval, slave_dev->name);
		res = __bond_opt_set_notify(bond, BOND_OPT_ACTIVE_SLAVE,
					    &newval);
		break;
	default:
		res = -EOPNOTSUPP;
	}

	return res;
}

/* ndo_siocdevprivate handler: translate the *_OLD command numbers to their
 * SIOCBOND* equivalents and forward them to bond_do_ioctl(). The two query
 * commands are forwarded with a local ifreq whose ifr_data is @data; the
 * others pass @ifr through unchanged. Unknown commands yield -EOPNOTSUPP.
 */
static int bond_siocdevprivate(struct net_device *bond_dev, struct ifreq *ifr,
			       void __user *data, int cmd)
{
	struct ifreq ifrdata = { .ifr_data = data };

	switch (cmd) {
	case BOND_INFO_QUERY_OLD:
		return bond_do_ioctl(bond_dev, &ifrdata, SIOCBONDINFOQUERY);
	case BOND_SLAVE_INFO_QUERY_OLD:
		return bond_do_ioctl(bond_dev, &ifrdata, SIOCBONDSLAVEINFOQUERY);
	case BOND_ENSLAVE_OLD:
		return bond_do_ioctl(bond_dev, ifr, SIOCBONDENSLAVE);
	case BOND_RELEASE_OLD:
		return bond_do_ioctl(bond_dev, ifr, SIOCBONDRELEASE);
	case BOND_SETHWADDR_OLD:
		return bond_do_ioctl(bond_dev, ifr, SIOCBONDSETHWADDR);
	case BOND_CHANGE_ACTIVE_OLD:
		return bond_do_ioctl(bond_dev, ifr, SIOCBONDCHANGEACTIVE);
	}

	return -EOPNOTSUPP;
}

/* ndo_change_rx_flags handler: for each of IFF_PROMISC and IFF_ALLMULTI set
 * in @change, pass +1 when the flag is now set on the bond and -1 when it is
 * now clear.
 */
static void bond_change_rx_flags(struct net_device *bond_dev, int change)
{
	struct bonding *bond = netdev_priv(bond_dev);

	if (change & IFF_PROMISC)
		bond_set_promiscuity(bond,
				     bond_dev->flags & IFF_PROMISC ? 1 : -1);

	if (change & IFF_ALLMULTI)
		bond_set_allmulti(bond,
				  bond_dev->flags & IFF_ALLMULTI ? 1 : -1);
}

/* ndo_set_rx_mode handler: sync the bond's unicast and multicast address
 * lists to the current active slave when the mode uses a primary, otherwise
 * to every slave (using the *_sync_multiple variants).
 */
static void bond_set_rx_mode(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct list_head *iter;
	struct slave *slave;

	rcu_read_lock();
	if (bond_uses_primary(bond)) {
		slave = rcu_dereference(bond->curr_active_slave);
		if (slave) {
			dev_uc_sync(slave->dev, bond_dev);
			dev_mc_sync(slave->dev, bond_dev);
		}
	} else {
		bond_for_each_slave_rcu(bond, slave, iter) {
			dev_uc_sync_multiple(slave->dev, bond_dev);
			dev_mc_sync_multiple(slave->dev, bond_dev);
		}
	}
	rcu_read_unlock();
}

/* neigh_setup callback installed by bond_neigh_setup(): ask the first
 * slave's ndo_neigh_setup for its parameters and, when that provides a
 * neigh_setup hook, run the hook on @n.
 *
 * Returns 0 when there is no slave, no ndo_neigh_setup or no hook; otherwise
 * the error from ndo_neigh_setup or the hook's return value.
 */
static int bond_neigh_init(struct neighbour *n)
{
	struct bonding *bond = netdev_priv(n->dev);
	const struct net_device_ops *slave_ops;
	struct neigh_parms parms;
	struct slave *slave;
	int ret = 0;

	rcu_read_lock();
	slave = bond_first_slave_rcu(bond);
	if (!slave)
		goto out;
	slave_ops = slave->dev->netdev_ops;
	if (!slave_ops->ndo_neigh_setup)
		goto out;

	/* TODO: find another way [1] to implement this.
	 * Passing a zeroed structure is fragile,
	 * but at least we do not pass garbage.
	 *
	 * [1] One way would be that ndo_neigh_setup() never touch
	 *     struct neigh_parms, but propagate the new neigh_setup()
	 *     back to ___neigh_create() / neigh_parms_alloc()
	 */
	memset(&parms, 0, sizeof(parms));
	ret = slave_ops->ndo_neigh_setup(slave->dev, &parms);

	if (ret)
		goto out;

	if (parms.neigh_setup)
		ret = parms.neigh_setup(n);
out:
	rcu_read_unlock();
	return ret;
}

/* The bonding ndo_neigh_setup is called at init time before any
 * slave exists. So we must declare proxy setup function which will
 * be used at run time to resolve the actual slave neigh param setup.
 *
 * It's also called by master devices (such as vlans) to setup their
 * underlying devices. In that case - do nothing, we're already set up from
 * our init.
*/ static int bond_neigh_setup(struct net_device *dev, struct neigh_parms *parms) { /* modify only our neigh_parms */ if (parms->dev == dev) parms->neigh_setup = bond_neigh_init; return 0; } /* Change the MTU of all of a master's slaves to match the master */ static int bond_change_mtu(struct net_device *bond_dev, int new_mtu) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *rollback_slave; struct list_head *iter; int res = 0; netdev_dbg(bond_dev, "bond=%p, new_mtu=%d\n", bond, new_mtu); bond_for_each_slave(bond, slave, iter) { slave_dbg(bond_dev, slave->dev, "s %p c_m %p\n", slave, slave->dev->netdev_ops->ndo_change_mtu); res = dev_set_mtu(slave->dev, new_mtu); if (res) { /* If we failed to set the slave's mtu to the new value * we must abort the operation even in ACTIVE_BACKUP * mode, because if we allow the backup slaves to have * different mtu values than the active slave we'll * need to change their mtu when doing a failover. That * means changing their mtu from timer context, which * is probably not a good idea. */ slave_dbg(bond_dev, slave->dev, "err %d setting mtu to %d\n", res, new_mtu); goto unwind; } } WRITE_ONCE(bond_dev->mtu, new_mtu); return 0; unwind: /* unwind from head to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { int tmp_res; if (rollback_slave == slave) break; tmp_res = dev_set_mtu(rollback_slave->dev, bond_dev->mtu); if (tmp_res) slave_dbg(bond_dev, rollback_slave->dev, "unwind err %d\n", tmp_res); } return res; } /* Change HW address * * Note that many devices must be down to change the HW address, and * downing the master releases all slaves. We can make bonds full of * bonding devices to test this, however. 
*/
static int bond_set_mac_address(struct net_device *bond_dev, void *addr)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct sockaddr_storage *ss = addr, old_ss;
	struct slave *cur, *undo;
	struct list_head *iter;
	int err = 0;

	/* ALB mode is handed over to its own address-change routine. */
	if (BOND_MODE(bond) == BOND_MODE_ALB)
		return bond_alb_set_mac_address(bond_dev, addr);

	netdev_dbg(bond_dev, "%s: bond=%p\n", __func__, bond);

	/* With fail_over_mac enabled in active-backup mode nothing is changed;
	 * success is reported because an error would make ifenslave fail.
	 */
	if (bond->params.fail_over_mac &&
	    BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
		return 0;

	if (!is_valid_ether_addr(ss->__data))
		return -EADDRNOTAVAIL;

	bond_for_each_slave(bond, cur, iter) {
		slave_dbg(bond_dev, cur->dev, "%s: slave=%p\n",
			  __func__, cur);

		err = dev_set_mac_address(cur->dev, addr, NULL);
		if (!err)
			continue;

		/* TODO: consider downing the slave
		 * and retry ?
		 * User should expect communications
		 * breakage anyway until ARP finish
		 * updating, so...
		 */
		slave_dbg(bond_dev, cur->dev, "%s: err %d\n",
			  __func__, err);
		break;
	}

	if (!err) {
		/* every slave accepted the address: adopt it on the bond */
		dev_addr_set(bond_dev, ss->__data);
		return 0;
	}

	/* Build a sockaddr carrying the bond's current address for the
	 * rollback below.
	 */
	memcpy(old_ss.__data, bond_dev->dev_addr, bond_dev->addr_len);
	old_ss.ss_family = bond_dev->type;

	/* Roll back, from the head of the list up to the slave that failed. */
	bond_for_each_slave(bond, undo, iter) {
		int undo_err;

		if (undo == cur)
			break;

		undo_err = dev_set_mac_address(undo->dev,
					       (struct sockaddr *)&old_ss, NULL);
		if (undo_err)
			slave_dbg(bond_dev, undo->dev, "%s: unwind err %d\n",
				  __func__, undo_err);
	}

	return err;
}

/**
 * bond_get_slave_by_id - get xmit slave with slave_id
 * @bond: bonding device that is transmitting
 * @slave_id: slave id up to slave_cnt-1 through which to transmit
 *
 * This function tries to get slave with slave_id but in case
 * it fails, it tries to find the first available slave for transmission.
*/
static struct slave *bond_get_slave_by_id(struct bonding *bond,
					  int slave_id)
{
	struct list_head *iter;
	struct slave *slave;
	int i = slave_id;

	/* Here we start from the slave with slave_id */
	bond_for_each_slave_rcu(bond, slave, iter) {
		/* i goes negative once @slave_id slaves have been skipped */
		if (--i < 0) {
			if (bond_slave_can_tx(slave))
				return slave;
		}
	}

	/* Here we start from the first slave up to slave_id */
	i = slave_id;
	bond_for_each_slave_rcu(bond, slave, iter) {
		if (--i < 0)
			break;
		if (bond_slave_can_tx(slave))
			return slave;
	}
	/* no slave that can tx has been found */
	return NULL;
}

/**
 * bond_rr_gen_slave_id - generate slave id based on packets_per_slave
 * @bond: bonding device to use
 *
 * Based on the value of the bonding device's packets_per_slave parameter
 * this function generates a slave id, which is usually used as the next
 * slave to transmit through.
 */
static u32 bond_rr_gen_slave_id(struct bonding *bond)
{
	u32 slave_id;
	struct reciprocal_value reciprocal_packets_per_slave;
	int packets_per_slave = bond->params.packets_per_slave;

	switch (packets_per_slave) {
	case 0:
		/* 0: the id is a random number */
		slave_id = get_random_u32();
		break;
	case 1:
		/* 1: the id is the incremented per-cpu counter itself */
		slave_id = this_cpu_inc_return(*bond->rr_tx_counter);
		break;
	default:
		/* otherwise: the incremented counter divided by
		 * packets_per_slave, using the precomputed reciprocal
		 */
		reciprocal_packets_per_slave =
			bond->params.reciprocal_packets_per_slave;
		slave_id = this_cpu_inc_return(*bond->rr_tx_counter);
		slave_id = reciprocal_divide(slave_id,
					     reciprocal_packets_per_slave);
		break;
	}

	return slave_id;
}

/* Pick the slave a round-robin transmit of @skb should use. IGMP packets go
 * to the current active slave (or the first slave able to transmit);
 * everything else goes to the slave selected by bond_rr_gen_slave_id()
 * modulo the slave count. Returns NULL when no slave is available.
 */
static struct slave *bond_xmit_roundrobin_slave_get(struct bonding *bond,
						    struct sk_buff *skb)
{
	struct slave *slave;
	int slave_cnt;
	u32 slave_id;

	/* Start with the curr_active_slave that joined the bond as the
	 * default for sending IGMP traffic.  For failover purposes one
	 * needs to maintain some consistency for the interface that will
	 * send the join/membership reports.  The curr_active_slave found
	 * will send all of this type of traffic.
	 */
	if (skb->protocol == htons(ETH_P_IP)) {
		int noff = skb_network_offset(skb);
		struct iphdr *iph;

		/* An IP header that cannot be pulled is treated as non-IGMP. */
		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph))))
			goto non_igmp;

		iph = ip_hdr(skb);
		if (iph->protocol == IPPROTO_IGMP) {
			slave = rcu_dereference(bond->curr_active_slave);
			if (slave)
				return slave;
			return bond_get_slave_by_id(bond, 0);
		}
	}

non_igmp:
	slave_cnt = READ_ONCE(bond->slave_cnt);
	if (likely(slave_cnt)) {
		slave_id = bond_rr_gen_slave_id(bond) % slave_cnt;
		return bond_get_slave_by_id(bond, slave_id);
	}
	return NULL;
}

/* XDP counterpart of bond_xmit_roundrobin_slave_get(): the Ethernet and IP
 * headers are read directly from the xdp buffer, after checking that each
 * lies before xdp->data_end.
 */
static struct slave *bond_xdp_xmit_roundrobin_slave_get(struct bonding *bond,
							 struct xdp_buff *xdp)
{
	struct slave *slave;
	int slave_cnt;
	u32 slave_id;
	const struct ethhdr *eth;
	void *data = xdp->data;

	if (data + sizeof(struct ethhdr) > xdp->data_end)
		goto non_igmp;

	eth = (struct ethhdr *)data;
	data += sizeof(struct ethhdr);

	/* See comment on IGMP in bond_xmit_roundrobin_slave_get() */
	if (eth->h_proto == htons(ETH_P_IP)) {
		const struct iphdr *iph;

		if (data + sizeof(struct iphdr) > xdp->data_end)
			goto non_igmp;

		iph = (struct iphdr *)data;

		if (iph->protocol == IPPROTO_IGMP) {
			slave = rcu_dereference(bond->curr_active_slave);
			if (slave)
				return slave;
			return bond_get_slave_by_id(bond, 0);
		}
	}

non_igmp:
	slave_cnt = READ_ONCE(bond->slave_cnt);
	if (likely(slave_cnt)) {
		slave_id = bond_rr_gen_slave_id(bond) % slave_cnt;
		return bond_get_slave_by_id(bond, slave_id);
	}
	return NULL;
}

/* Round-robin transmit: queue @skb on the selected slave, or drop it when
 * no slave could be selected.
 */
static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
					struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave;

	slave = bond_xmit_roundrobin_slave_get(bond, skb);
	if (likely(slave))
		return bond_dev_queue_xmit(bond, skb, slave->dev);

	return bond_tx_drop(bond_dev, skb);
}

/* Active-backup slave selection: the current active slave, which may be
 * NULL.
 */
static struct slave *bond_xmit_activebackup_slave_get(struct bonding *bond)
{
	return rcu_dereference(bond->curr_active_slave);
}

/* In active-backup mode, we know that bond->curr_active_slave is always valid if
 * the bond has a usable interface.
*/
static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb,
					  struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);
	struct slave *slave;

	slave = bond_xmit_activebackup_slave_get(bond);
	if (slave)
		return bond_dev_queue_xmit(bond, skb, slave->dev);

	/* no active slave: the packet is dropped */
	return bond_tx_drop(bond_dev, skb);
}

/* Use this to update slave_array when (a) it's not appropriate to update
 * slave_array right away (note that update_slave_array() may sleep)
 * and / or (b) RTNL is not held.
 */
void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay)
{
	queue_delayed_work(bond->wq, &bond->slave_arr_work, delay);
}

/* Slave array work handler. Holds only RTNL */
static void bond_slave_arr_handler(struct work_struct *work)
{
	struct bonding *bond = container_of(work, struct bonding,
					    slave_arr_work.work);
	int ret;

	/* RTNL is only tried, never waited for: on contention the work is
	 * re-queued through the err path below.
	 */
	if (!rtnl_trylock())
		goto err;

	ret = bond_update_slave_arr(bond, NULL);
	rtnl_unlock();
	if (ret) {
		pr_warn_ratelimited("Failed to update slave array from WT\n");
		goto err;
	}
	return;

err:
	/* retry with a delay of 1 */
	bond_slave_arr_work_rearm(bond, 1);
}

/* Remove @skipslave from @slaves in place. @slaves may be NULL, in which
 * case nothing is done.
 */
static void bond_skip_slave(struct bond_up_slave *slaves,
			    struct slave *skipslave)
{
	int idx;

	/* Rare situation where caller has asked to skip a specific
	 * slave but allocation failed (most likely!). BTW this is
	 * only possible when the call is initiated from
	 * __bond_release_one(). In this situation; overwrite the
	 * skipslave entry in the array with the last entry from the
	 * array to avoid a situation where the xmit path may choose
	 * this to-be-skipped slave to send a packet out.
	 */
	for (idx = 0; slaves && idx < slaves->count; idx++) {
		if (skipslave == slaves->arr[idx]) {
			slaves->arr[idx] =
				slaves->arr[slaves->count - 1];
			slaves->count--;
			break;
		}
	}
}

/* Publish new usable/all slave arrays and free the previous ones through
 * kfree_rcu(). The old pointers are read with rtnl_dereference().
 */
static void bond_set_slave_arr(struct bonding *bond,
			       struct bond_up_slave *usable_slaves,
			       struct bond_up_slave *all_slaves)
{
	struct bond_up_slave *usable, *all;

	usable = rtnl_dereference(bond->usable_slaves);
	rcu_assign_pointer(bond->usable_slaves, usable_slaves);
	kfree_rcu(usable, rcu);

	all = rtnl_dereference(bond->all_slaves);
	rcu_assign_pointer(bond->all_slaves, all_slaves);
	kfree_rcu(all, rcu);
}

/* Replace both slave arrays with NULL. */
static void bond_reset_slave_arr(struct bonding *bond)
{
	bond_set_slave_arr(bond, NULL, NULL);
}

/* Build the usable slaves array in control path for modes that use xmit-hash
 * to determine the slave interface -
 * (a) BOND_MODE_8023AD
 * (b) BOND_MODE_XOR
 * (c) (BOND_MODE_TLB || BOND_MODE_ALB) && tlb_dynamic_lb == 0
 *
 * The caller is expected to hold RTNL only and NO other lock!
 */
int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
{
	struct bond_up_slave *usable_slaves = NULL, *all_slaves = NULL;
	struct slave *slave;
	struct list_head *iter;
	int agg_id = 0;
	int ret = 0;

	might_sleep();

	/* Both arrays are sized for the full slave count. */
	usable_slaves = kzalloc(struct_size(usable_slaves, arr,
					    bond->slave_cnt), GFP_KERNEL);
	all_slaves = kzalloc(struct_size(all_slaves, arr,
					 bond->slave_cnt), GFP_KERNEL);
	if (!usable_slaves || !all_slaves) {
		ret = -ENOMEM;
		goto out;
	}
	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
		struct ad_info ad_info;

		spin_lock_bh(&bond->mode_lock);
		if (bond_3ad_get_active_agg_info(bond, &ad_info)) {
			spin_unlock_bh(&bond->mode_lock);
			pr_debug("bond_3ad_get_active_agg_info failed\n");
			/* No active aggregator means it's not safe to use
			 * the previous array.
			 */
			bond_reset_slave_arr(bond);
			goto out;
		}
		spin_unlock_bh(&bond->mode_lock);
		agg_id = ad_info.aggregator_id;
	}
	bond_for_each_slave(bond, slave, iter) {
		if (skipslave == slave)
			continue;

		/* all_slaves takes every slave except skipslave ... */
		all_slaves->arr[all_slaves->count++] = slave;
		if (BOND_MODE(bond) == BOND_MODE_8023AD) {
			struct aggregator *agg;

			agg = SLAVE_AD_INFO(slave)->port.aggregator;
			if (!agg || agg->aggregator_identifier != agg_id)
				continue;
		}
		/* ... usable_slaves only those that can currently transmit
		 * (and, in 802.3ad mode, sit in the active aggregator).
		 */
		if (!bond_slave_can_tx(slave))
			continue;

		slave_dbg(bond->dev, slave->dev, "Adding slave to tx hash array[%d]\n",
			  usable_slaves->count);

		usable_slaves->arr[usable_slaves->count++] = slave;
	}

	bond_set_slave_arr(bond, usable_slaves, all_slaves);
	return ret;
out:
	/* On error the arrays currently published stay in use, so
	 * skipslave is removed from them in place.
	 */
	if (ret != 0 && skipslave) {
		bond_skip_slave(rtnl_dereference(bond->all_slaves),
				skipslave);
		bond_skip_slave(rtnl_dereference(bond->usable_slaves),
				skipslave);
	}
	kfree_rcu(all_slaves, rcu);
	kfree_rcu(usable_slaves, rcu);

	return ret;
}

/* Select the entry of @slaves indexed by the transmit hash of @skb modulo
 * the array's count. Returns NULL when @slaves is NULL or empty.
 */
static struct slave *bond_xmit_3ad_xor_slave_get(struct bonding *bond,
						 struct sk_buff *skb,
						 struct bond_up_slave *slaves)
{
	struct slave *slave;
	unsigned int count;
	u32 hash;

	hash = bond_xmit_hash(bond, skb);
	count = slaves ? READ_ONCE(slaves->count) : 0;
	if (unlikely(!count))
		return NULL;

	slave = slaves->arr[hash % count];
	return slave;
}

/* XDP counterpart of bond_xmit_3ad_xor_slave_get(): hashes the xdp buffer
 * and always selects from bond->usable_slaves.
 */
static struct slave *bond_xdp_xmit_3ad_xor_slave_get(struct bonding *bond,
						     struct xdp_buff *xdp)
{
	struct bond_up_slave *slaves;
	unsigned int count;
	u32 hash;

	hash = bond_xmit_hash_xdp(bond, xdp);
	slaves = rcu_dereference(bond->usable_slaves);
	count = slaves ? READ_ONCE(slaves->count) : 0;
	if (unlikely(!count))
		return NULL;

	return slaves->arr[hash % count];
}

/* Use this Xmit function for 3AD as well as XOR modes. The current
 * usable slave array is formed in the control path. The xmit function
 * just calculates hash and sends the packet out.
*/ static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); struct bond_up_slave *slaves; struct slave *slave; slaves = rcu_dereference(bond->usable_slaves); slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves); if (likely(slave)) return bond_dev_queue_xmit(bond, skb, slave->dev); return bond_tx_drop(dev, skb); } /* in broadcast mode, we send everything to all usable interfaces. */ static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave = NULL; struct list_head *iter; bool xmit_suc = false; bool skb_used = false; bond_for_each_slave_rcu(bond, slave, iter) { struct sk_buff *skb2; if (!(bond_slave_is_up(slave) && slave->link == BOND_LINK_UP)) continue; if (bond_is_last_slave(bond, slave)) { skb2 = skb; skb_used = true; } else { skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) { net_err_ratelimited("%s: Error: %s: skb_clone() failed\n", bond_dev->name, __func__); continue; } } if (bond_dev_queue_xmit(bond, skb2, slave->dev) == NETDEV_TX_OK) xmit_suc = true; } if (!skb_used) dev_kfree_skb_any(skb); if (xmit_suc) return NETDEV_TX_OK; dev_core_stats_tx_dropped_inc(bond_dev); return NET_XMIT_DROP; } /*------------------------- Device initialization ---------------------------*/ /* Lookup the slave that corresponds to a qid */ static inline int bond_slave_override(struct bonding *bond, struct sk_buff *skb) { struct slave *slave = NULL; struct list_head *iter; if (!skb_rx_queue_recorded(skb)) return 1; /* Find out if any slaves have the same mapping as this skb. */ bond_for_each_slave_rcu(bond, slave, iter) { if (READ_ONCE(slave->queue_id) == skb_get_queue_mapping(skb)) { if (bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) { bond_dev_queue_xmit(bond, skb, slave->dev); return 0; } /* If the slave isn't UP, use default transmit policy. 
*/ break; } } return 1; } static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { /* This helper function exists to help dev_pick_tx get the correct * destination queue. Using a helper function skips a call to * skb_tx_hash and will put the skbs in the queue we expect on their * way down to the bonding driver. */ u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0; /* Save the original txq to restore before passing to the driver */ qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb_get_queue_mapping(skb); if (unlikely(txq >= dev->real_num_tx_queues)) { do { txq -= dev->real_num_tx_queues; } while (txq >= dev->real_num_tx_queues); } return txq; } static struct net_device *bond_xmit_get_slave(struct net_device *master_dev, struct sk_buff *skb, bool all_slaves) { struct bonding *bond = netdev_priv(master_dev); struct bond_up_slave *slaves; struct slave *slave = NULL; switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: slave = bond_xmit_roundrobin_slave_get(bond, skb); break; case BOND_MODE_ACTIVEBACKUP: slave = bond_xmit_activebackup_slave_get(bond); break; case BOND_MODE_8023AD: case BOND_MODE_XOR: if (all_slaves) slaves = rcu_dereference(bond->all_slaves); else slaves = rcu_dereference(bond->usable_slaves); slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves); break; case BOND_MODE_BROADCAST: break; case BOND_MODE_ALB: slave = bond_xmit_alb_slave_get(bond, skb); break; case BOND_MODE_TLB: slave = bond_xmit_tlb_slave_get(bond, skb); break; default: /* Should never happen, mode already checked */ WARN_ONCE(true, "Unknown bonding mode"); break; } if (slave) return slave->dev; return NULL; } static void bond_sk_to_flow(struct sock *sk, struct flow_keys *flow) { switch (sk->sk_family) { #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: if (ipv6_only_sock(sk) || ipv6_addr_type(&sk->sk_v6_daddr) != IPV6_ADDR_MAPPED) { flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; flow->addrs.v6addrs.src = 
inet6_sk(sk)->saddr; flow->addrs.v6addrs.dst = sk->sk_v6_daddr; break; } fallthrough; #endif default: /* AF_INET */ flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; flow->addrs.v4addrs.src = inet_sk(sk)->inet_rcv_saddr; flow->addrs.v4addrs.dst = inet_sk(sk)->inet_daddr; break; } flow->ports.src = inet_sk(sk)->inet_sport; flow->ports.dst = inet_sk(sk)->inet_dport; } /** * bond_sk_hash_l34 - generate a hash value based on the socket's L3 and L4 fields * @sk: socket to use for headers * * This function will extract the necessary field from the socket and use * them to generate a hash based on the LAYER34 xmit_policy. * Assumes that sk is a TCP or UDP socket. */ static u32 bond_sk_hash_l34(struct sock *sk) { struct flow_keys flow; u32 hash; bond_sk_to_flow(sk, &flow); /* L4 */ memcpy(&hash, &flow.ports.ports, sizeof(hash)); /* L3 */ return bond_ip_hash(hash, &flow, BOND_XMIT_POLICY_LAYER34); } static struct net_device *__bond_sk_get_lower_dev(struct bonding *bond, struct sock *sk) { struct bond_up_slave *slaves; struct slave *slave; unsigned int count; u32 hash; slaves = rcu_dereference(bond->usable_slaves); count = slaves ? READ_ONCE(slaves->count) : 0; if (unlikely(!count)) return NULL; hash = bond_sk_hash_l34(sk); slave = slaves->arr[hash % count]; return slave->dev; } static struct net_device *bond_sk_get_lower_dev(struct net_device *dev, struct sock *sk) { struct bonding *bond = netdev_priv(dev); struct net_device *lower = NULL; rcu_read_lock(); if (bond_sk_check(bond)) lower = __bond_sk_get_lower_dev(bond, sk); rcu_read_unlock(); return lower; } #if IS_ENABLED(CONFIG_TLS_DEVICE) static netdev_tx_t bond_tls_device_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *dev) { struct net_device *tls_netdev = rcu_dereference(tls_get_ctx(skb->sk)->netdev); /* tls_netdev might become NULL, even if tls_is_skb_tx_device_offloaded * was true, if tls_device_down is running in parallel, but it's OK, * because bond_get_slave_by_dev has a NULL check. 
*/ if (likely(bond_get_slave_by_dev(bond, tls_netdev))) return bond_dev_queue_xmit(bond, skb, tls_netdev); return bond_tx_drop(dev, skb); } #endif static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); if (bond_should_override_tx_queue(bond) && !bond_slave_override(bond, skb)) return NETDEV_TX_OK; #if IS_ENABLED(CONFIG_TLS_DEVICE) if (tls_is_skb_tx_device_offloaded(skb)) return bond_tls_device_xmit(bond, skb, dev); #endif switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: return bond_xmit_roundrobin(skb, dev); case BOND_MODE_ACTIVEBACKUP: return bond_xmit_activebackup(skb, dev); case BOND_MODE_8023AD: case BOND_MODE_XOR: return bond_3ad_xor_xmit(skb, dev); case BOND_MODE_BROADCAST: return bond_xmit_broadcast(skb, dev); case BOND_MODE_ALB: return bond_alb_xmit(skb, dev); case BOND_MODE_TLB: return bond_tlb_xmit(skb, dev); default: /* Should never happen, mode already checked */ netdev_err(dev, "Unknown bonding mode %d\n", BOND_MODE(bond)); WARN_ON_ONCE(1); return bond_tx_drop(dev, skb); } } static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); netdev_tx_t ret = NETDEV_TX_OK; /* If we risk deadlock from transmitting this in the * netpoll path, tell netpoll to queue the frame for later tx */ if (unlikely(is_netpoll_tx_blocked(dev))) return NETDEV_TX_BUSY; rcu_read_lock(); if (bond_has_slaves(bond)) ret = __bond_start_xmit(skb, dev); else ret = bond_tx_drop(dev, skb); rcu_read_unlock(); return ret; } static struct net_device * bond_xdp_get_xmit_slave(struct net_device *bond_dev, struct xdp_buff *xdp) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; /* Caller needs to hold rcu_read_lock() */ switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: slave = bond_xdp_xmit_roundrobin_slave_get(bond, xdp); break; case BOND_MODE_ACTIVEBACKUP: slave = bond_xmit_activebackup_slave_get(bond); break; case BOND_MODE_8023AD: 
case BOND_MODE_XOR: slave = bond_xdp_xmit_3ad_xor_slave_get(bond, xdp); break; default: /* Should never happen. Mode guarded by bond_xdp_check() */ netdev_err(bond_dev, "Unknown bonding mode %d for xdp xmit\n", BOND_MODE(bond)); WARN_ON_ONCE(1); return NULL; } if (slave) return slave->dev; return NULL; } static int bond_xdp_xmit(struct net_device *bond_dev, int n, struct xdp_frame **frames, u32 flags) { int nxmit, err = -ENXIO; rcu_read_lock(); for (nxmit = 0; nxmit < n; nxmit++) { struct xdp_frame *frame = frames[nxmit]; struct xdp_frame *frames1[] = {frame}; struct net_device *slave_dev; struct xdp_buff xdp; xdp_convert_frame_to_buff(frame, &xdp); slave_dev = bond_xdp_get_xmit_slave(bond_dev, &xdp); if (!slave_dev) { err = -ENXIO; break; } err = slave_dev->netdev_ops->ndo_xdp_xmit(slave_dev, 1, frames1, flags); if (err < 1) break; } rcu_read_unlock(); /* If error happened on the first frame then we can pass the error up, otherwise * report the number of frames that were xmitted. */ if (err < 0) return (nxmit == 0 ? 
err : nxmit); return nxmit; } static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; struct slave *slave, *rollback_slave; struct bpf_prog *old_prog; struct netdev_bpf xdp = { .command = XDP_SETUP_PROG, .flags = 0, .prog = prog, .extack = extack, }; int err; ASSERT_RTNL(); if (!bond_xdp_check(bond)) return -EOPNOTSUPP; old_prog = bond->xdp_prog; bond->xdp_prog = prog; bond_for_each_slave(bond, slave, iter) { struct net_device *slave_dev = slave->dev; if (!slave_dev->netdev_ops->ndo_bpf || !slave_dev->netdev_ops->ndo_xdp_xmit) { SLAVE_NL_ERR(dev, slave_dev, extack, "Slave device does not support XDP"); err = -EOPNOTSUPP; goto err; } if (dev_xdp_prog_count(slave_dev) > 0) { SLAVE_NL_ERR(dev, slave_dev, extack, "Slave has XDP program loaded, please unload before enslaving"); err = -EOPNOTSUPP; goto err; } err = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp); if (err < 0) { /* ndo_bpf() sets extack error message */ slave_err(dev, slave_dev, "Error %d calling ndo_bpf\n", err); goto err; } if (prog) bpf_prog_inc(prog); } if (prog) { static_branch_inc(&bpf_master_redirect_enabled_key); } else if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&bpf_master_redirect_enabled_key); } return 0; err: /* unwind the program changes */ bond->xdp_prog = old_prog; xdp.prog = old_prog; xdp.extack = NULL; /* do not overwrite original error */ bond_for_each_slave(bond, rollback_slave, iter) { struct net_device *slave_dev = rollback_slave->dev; int err_unwind; if (slave == rollback_slave) break; err_unwind = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp); if (err_unwind < 0) slave_err(dev, slave_dev, "Error %d when unwinding XDP program change\n", err_unwind); else if (xdp.prog) bpf_prog_inc(xdp.prog); } return err; } static int bond_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return bond_xdp_set(dev, xdp->prog, 
xdp->extack); default: return -EINVAL; } } static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed) { if (speed == 0 || speed == SPEED_UNKNOWN) speed = slave->speed; else speed = min(speed, slave->speed); return speed; } /* Set the BOND_PHC_INDEX flag to notify user space */ static int bond_set_phc_index_flag(struct kernel_hwtstamp_config *kernel_cfg) { struct ifreq *ifr = kernel_cfg->ifr; struct hwtstamp_config cfg; if (kernel_cfg->copied_to_user) { /* Lower device has a legacy implementation */ if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) return -EFAULT; cfg.flags |= HWTSTAMP_FLAG_BONDED_PHC_INDEX; if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) return -EFAULT; } else { kernel_cfg->flags |= HWTSTAMP_FLAG_BONDED_PHC_INDEX; } return 0; } static int bond_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct bonding *bond = netdev_priv(dev); struct net_device *real_dev; int err; real_dev = bond_option_active_slave_get_rcu(bond); if (!real_dev) return -EOPNOTSUPP; err = generic_hwtstamp_get_lower(real_dev, cfg); if (err) return err; return bond_set_phc_index_flag(cfg); } static int bond_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(dev); struct net_device *real_dev; int err; if (!(cfg->flags & HWTSTAMP_FLAG_BONDED_PHC_INDEX)) return -EOPNOTSUPP; real_dev = bond_option_active_slave_get_rcu(bond); if (!real_dev) return -EOPNOTSUPP; err = generic_hwtstamp_set_lower(real_dev, cfg, extack); if (err) return err; return bond_set_phc_index_flag(cfg); } static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev, struct ethtool_link_ksettings *cmd) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; u32 speed = 0; cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; /* Since bond_slave_can_tx returns false for all inactive or down slaves, we * do not need to check mode. 
Though link speed might not represent * the true receive or transmit bandwidth (not all modes are symmetric) * this is an accurate maximum. */ bond_for_each_slave(bond, slave, iter) { if (bond_slave_can_tx(slave)) { bond_update_speed_duplex(slave); if (slave->speed != SPEED_UNKNOWN) { if (BOND_MODE(bond) == BOND_MODE_BROADCAST) speed = bond_mode_bcast_speed(slave, speed); else speed += slave->speed; } if (cmd->base.duplex == DUPLEX_UNKNOWN && slave->duplex != DUPLEX_UNKNOWN) cmd->base.duplex = slave->duplex; } } cmd->base.speed = speed ? : SPEED_UNKNOWN; return 0; } static void bond_ethtool_get_drvinfo(struct net_device *bond_dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d", BOND_ABI_VERSION); } static int bond_ethtool_get_ts_info(struct net_device *bond_dev, struct kernel_ethtool_ts_info *info) { struct bonding *bond = netdev_priv(bond_dev); struct kernel_ethtool_ts_info ts_info; struct net_device *real_dev; bool sw_tx_support = false; struct list_head *iter; struct slave *slave; int ret = 0; rcu_read_lock(); real_dev = bond_option_active_slave_get_rcu(bond); dev_hold(real_dev); rcu_read_unlock(); if (real_dev) { ret = ethtool_get_ts_info_by_layer(real_dev, info); } else { info->phc_index = -1; info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; /* Check if all slaves support software tx timestamping */ rcu_read_lock(); bond_for_each_slave_rcu(bond, slave, iter) { ret = ethtool_get_ts_info_by_layer(slave->dev, &ts_info); if (!ret && (ts_info.so_timestamping & SOF_TIMESTAMPING_TX_SOFTWARE)) { sw_tx_support = true; continue; } sw_tx_support = false; break; } rcu_read_unlock(); } if (sw_tx_support) info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE; dev_put(real_dev); return ret; } static const struct ethtool_ops bond_ethtool_ops = { .get_drvinfo = bond_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, 
/* Setup callback for a bond net_device (passed to alloc_netdev_mq() by
 * bond_create()). Initializes the private struct bonding from
 * bonding_defaults, wires up the netdev/ethtool operation tables and the
 * destructor, and sets the device flags and offload feature masks.
 *
 * @bond_dev: freshly allocated bond device whose private area is a
 *            struct bonding
 */
void bond_setup(struct net_device *bond_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);

	spin_lock_init(&bond->mode_lock);
	/* Start from the module-wide defaults. */
	bond->params = bonding_defaults;

	/* Initialize pointers */
	bond->dev = bond_dev;

	/* Initialize the device entry points */
	ether_setup(bond_dev);
	bond_dev->max_mtu = ETH_MAX_MTU;
	bond_dev->netdev_ops = &bond_netdev_ops;
	bond_dev->ethtool_ops = &bond_ethtool_ops;

	/* bond_destructor() releases the workqueue and per-cpu counter. */
	bond_dev->needs_free_netdev = true;
	bond_dev->priv_destructor = bond_destructor;

	SET_NETDEV_DEVTYPE(bond_dev, &bond_type);

	/* Initialize the device options */
	bond_dev->flags |= IFF_MASTER;
	bond_dev->priv_flags |= IFF_BONDING | IFF_UNICAST_FLT | IFF_NO_QUEUE;
	bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);

#ifdef CONFIG_XFRM_OFFLOAD
	/* set up xfrm device ops (only supported in active-backup right now) */
	bond_dev->xfrmdev_ops = &bond_xfrmdev_ops;
	INIT_LIST_HEAD(&bond->ipsec_list);
	mutex_init(&bond->ipsec_lock);
#endif /* CONFIG_XFRM_OFFLOAD */

	/* don't acquire bond device's netif_tx_lock when transmitting */
	bond_dev->features |= NETIF_F_LLTX;

	/* Don't allow bond devices to change network namespaces. */
	bond_dev->features |= NETIF_F_NETNS_LOCAL;

	/* By default, we declare the bond to be fully
	 * VLAN hardware accelerated capable. Special
	 * care is taken in the various xmit functions
	 * when there are slaves that are not hw accel
	 * capable
	 */
	bond_dev->hw_features = BOND_VLAN_FEATURES |
				NETIF_F_HW_VLAN_CTAG_RX |
				NETIF_F_HW_VLAN_CTAG_FILTER |
				NETIF_F_HW_VLAN_STAG_RX |
				NETIF_F_HW_VLAN_STAG_FILTER;

	bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
	/* Everything user-toggleable so far is also enabled by default. */
	bond_dev->features |= bond_dev->hw_features;
	/* VLAN TX insertion is enabled but is not part of hw_features. */
	bond_dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
#ifdef CONFIG_XFRM_OFFLOAD
	/* Added after the copy above, so not enabled by default. */
	bond_dev->hw_features |= BOND_XFRM_FEATURES;
	/* Only enable XFRM features if this is an active-backup config */
	if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
		bond_dev->features |= BOND_XFRM_FEATURES;
#endif /* CONFIG_XFRM_OFFLOAD */
}
*/ static void bond_uninit(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; bond_netpoll_cleanup(bond_dev); /* Release the bonded slaves */ bond_for_each_slave(bond, slave, iter) __bond_release_one(bond_dev, slave->dev, true, true); netdev_info(bond_dev, "Released all slaves\n"); #ifdef CONFIG_XFRM_OFFLOAD mutex_destroy(&bond->ipsec_lock); #endif /* CONFIG_XFRM_OFFLOAD */ bond_set_slave_arr(bond, NULL, NULL); list_del_rcu(&bond->bond_list); bond_debug_unregister(bond); } /*------------------------- Module initialization ---------------------------*/ static int __init bond_check_params(struct bond_params *params) { int arp_validate_value, fail_over_mac_value, primary_reselect_value, i; struct bond_opt_value newval; const struct bond_opt_value *valptr; int arp_all_targets_value = 0; u16 ad_actor_sys_prio = 0; u16 ad_user_port_key = 0; __be32 arp_target[BOND_MAX_ARP_TARGETS] = { 0 }; int arp_ip_count; int bond_mode = BOND_MODE_ROUNDROBIN; int xmit_hashtype = BOND_XMIT_POLICY_LAYER2; int lacp_fast = 0; int tlb_dynamic_lb; /* Convert string parameters. 
*/ if (mode) { bond_opt_initstr(&newval, mode); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_MODE), &newval); if (!valptr) { pr_err("Error: Invalid bonding mode \"%s\"\n", mode); return -EINVAL; } bond_mode = valptr->value; } if (xmit_hash_policy) { if (bond_mode == BOND_MODE_ROUNDROBIN || bond_mode == BOND_MODE_ACTIVEBACKUP || bond_mode == BOND_MODE_BROADCAST) { pr_info("xmit_hash_policy param is irrelevant in mode %s\n", bond_mode_name(bond_mode)); } else { bond_opt_initstr(&newval, xmit_hash_policy); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_XMIT_HASH), &newval); if (!valptr) { pr_err("Error: Invalid xmit_hash_policy \"%s\"\n", xmit_hash_policy); return -EINVAL; } xmit_hashtype = valptr->value; } } if (lacp_rate) { if (bond_mode != BOND_MODE_8023AD) { pr_info("lacp_rate param is irrelevant in mode %s\n", bond_mode_name(bond_mode)); } else { bond_opt_initstr(&newval, lacp_rate); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_LACP_RATE), &newval); if (!valptr) { pr_err("Error: Invalid lacp rate \"%s\"\n", lacp_rate); return -EINVAL; } lacp_fast = valptr->value; } } if (ad_select) { bond_opt_initstr(&newval, ad_select); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_SELECT), &newval); if (!valptr) { pr_err("Error: Invalid ad_select \"%s\"\n", ad_select); return -EINVAL; } params->ad_select = valptr->value; if (bond_mode != BOND_MODE_8023AD) pr_warn("ad_select param only affects 802.3ad mode\n"); } else { params->ad_select = BOND_AD_STABLE; } if (max_bonds < 0) { pr_warn("Warning: max_bonds (%d) not in range %d-%d, so it was reset to BOND_DEFAULT_MAX_BONDS (%d)\n", max_bonds, 0, INT_MAX, BOND_DEFAULT_MAX_BONDS); max_bonds = BOND_DEFAULT_MAX_BONDS; } if (miimon < 0) { pr_warn("Warning: miimon module parameter (%d), not in range 0-%d, so it was reset to 0\n", miimon, INT_MAX); miimon = 0; } if (updelay < 0) { pr_warn("Warning: updelay module parameter (%d), not in range 0-%d, so it was reset to 0\n", updelay, INT_MAX); updelay = 0; } if (downdelay < 0) { 
pr_warn("Warning: downdelay module parameter (%d), not in range 0-%d, so it was reset to 0\n", downdelay, INT_MAX); downdelay = 0; } if ((use_carrier != 0) && (use_carrier != 1)) { pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0/1), so it was set to 1\n", use_carrier); use_carrier = 1; } if (num_peer_notif < 0 || num_peer_notif > 255) { pr_warn("Warning: num_grat_arp/num_unsol_na (%d) not in range 0-255 so it was reset to 1\n", num_peer_notif); num_peer_notif = 1; } /* reset values for 802.3ad/TLB/ALB */ if (!bond_mode_uses_arp(bond_mode)) { if (!miimon) { pr_warn("Warning: miimon must be specified, otherwise bonding will not detect link failure, speed and duplex which are essential for 802.3ad operation\n"); pr_warn("Forcing miimon to 100msec\n"); miimon = BOND_DEFAULT_MIIMON; } } if (tx_queues < 1 || tx_queues > 255) { pr_warn("Warning: tx_queues (%d) should be between 1 and 255, resetting to %d\n", tx_queues, BOND_DEFAULT_TX_QUEUES); tx_queues = BOND_DEFAULT_TX_QUEUES; } if ((all_slaves_active != 0) && (all_slaves_active != 1)) { pr_warn("Warning: all_slaves_active module parameter (%d), not of valid value (0/1), so it was set to 0\n", all_slaves_active); all_slaves_active = 0; } if (resend_igmp < 0 || resend_igmp > 255) { pr_warn("Warning: resend_igmp (%d) should be between 0 and 255, resetting to %d\n", resend_igmp, BOND_DEFAULT_RESEND_IGMP); resend_igmp = BOND_DEFAULT_RESEND_IGMP; } bond_opt_initval(&newval, packets_per_slave); if (!bond_opt_parse(bond_opt_get(BOND_OPT_PACKETS_PER_SLAVE), &newval)) { pr_warn("Warning: packets_per_slave (%d) should be between 0 and %u resetting to 1\n", packets_per_slave, USHRT_MAX); packets_per_slave = 1; } if (bond_mode == BOND_MODE_ALB) { pr_notice("In ALB mode you might experience client disconnections upon reconnection of a link if the bonding module updelay parameter (%d msec) is incompatible with the forwarding delay time of the switch\n", updelay); } if (!miimon) { if (updelay || downdelay) { 
/* just warn the user the up/down delay will have * no effect since miimon is zero... */ pr_warn("Warning: miimon module parameter not set and updelay (%d) or downdelay (%d) module parameter is set; updelay and downdelay have no effect unless miimon is set\n", updelay, downdelay); } } else { /* don't allow arp monitoring */ if (arp_interval) { pr_warn("Warning: miimon (%d) and arp_interval (%d) can't be used simultaneously, disabling ARP monitoring\n", miimon, arp_interval); arp_interval = 0; } if ((updelay % miimon) != 0) { pr_warn("Warning: updelay (%d) is not a multiple of miimon (%d), updelay rounded to %d ms\n", updelay, miimon, (updelay / miimon) * miimon); } updelay /= miimon; if ((downdelay % miimon) != 0) { pr_warn("Warning: downdelay (%d) is not a multiple of miimon (%d), downdelay rounded to %d ms\n", downdelay, miimon, (downdelay / miimon) * miimon); } downdelay /= miimon; } if (arp_interval < 0) { pr_warn("Warning: arp_interval module parameter (%d), not in range 0-%d, so it was reset to 0\n", arp_interval, INT_MAX); arp_interval = 0; } for (arp_ip_count = 0, i = 0; (arp_ip_count < BOND_MAX_ARP_TARGETS) && arp_ip_target[i]; i++) { __be32 ip; /* not a complete check, but good enough to catch mistakes */ if (!in4_pton(arp_ip_target[i], -1, (u8 *)&ip, -1, NULL) || !bond_is_ip_target_ok(ip)) { pr_warn("Warning: bad arp_ip_target module parameter (%s), ARP monitoring will not be performed\n", arp_ip_target[i]); arp_interval = 0; } else { if (bond_get_targets_ip(arp_target, ip) == -1) arp_target[arp_ip_count++] = ip; else pr_warn("Warning: duplicate address %pI4 in arp_ip_target, skipping\n", &ip); } } if (arp_interval && !arp_ip_count) { /* don't allow arping if no arp_ip_target given... 
*/ pr_warn("Warning: arp_interval module parameter (%d) specified without providing an arp_ip_target parameter, arp_interval was reset to 0\n", arp_interval); arp_interval = 0; } if (arp_validate) { if (!arp_interval) { pr_err("arp_validate requires arp_interval\n"); return -EINVAL; } bond_opt_initstr(&newval, arp_validate); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_VALIDATE), &newval); if (!valptr) { pr_err("Error: invalid arp_validate \"%s\"\n", arp_validate); return -EINVAL; } arp_validate_value = valptr->value; } else { arp_validate_value = 0; } if (arp_all_targets) { bond_opt_initstr(&newval, arp_all_targets); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_ALL_TARGETS), &newval); if (!valptr) { pr_err("Error: invalid arp_all_targets_value \"%s\"\n", arp_all_targets); arp_all_targets_value = 0; } else { arp_all_targets_value = valptr->value; } } if (miimon) { pr_info("MII link monitoring set to %d ms\n", miimon); } else if (arp_interval) { valptr = bond_opt_get_val(BOND_OPT_ARP_VALIDATE, arp_validate_value); pr_info("ARP monitoring set to %d ms, validate %s, with %d target(s):", arp_interval, valptr->string, arp_ip_count); for (i = 0; i < arp_ip_count; i++) pr_cont(" %s", arp_ip_target[i]); pr_cont("\n"); } else if (max_bonds) { /* miimon and arp_interval not set, we need one so things * work as expected, see bonding.txt for details */ pr_debug("Warning: either miimon or arp_interval and arp_ip_target module parameters must be specified, otherwise bonding will not detect link failures! 
see bonding.txt for details\n"); } if (primary && !bond_mode_uses_primary(bond_mode)) { /* currently, using a primary only makes sense * in active backup, TLB or ALB modes */ pr_warn("Warning: %s primary device specified but has no effect in %s mode\n", primary, bond_mode_name(bond_mode)); primary = NULL; } if (primary && primary_reselect) { bond_opt_initstr(&newval, primary_reselect); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_PRIMARY_RESELECT), &newval); if (!valptr) { pr_err("Error: Invalid primary_reselect \"%s\"\n", primary_reselect); return -EINVAL; } primary_reselect_value = valptr->value; } else { primary_reselect_value = BOND_PRI_RESELECT_ALWAYS; } if (fail_over_mac) { bond_opt_initstr(&newval, fail_over_mac); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_FAIL_OVER_MAC), &newval); if (!valptr) { pr_err("Error: invalid fail_over_mac \"%s\"\n", fail_over_mac); return -EINVAL; } fail_over_mac_value = valptr->value; if (bond_mode != BOND_MODE_ACTIVEBACKUP) pr_warn("Warning: fail_over_mac only affects active-backup mode\n"); } else { fail_over_mac_value = BOND_FOM_NONE; } bond_opt_initstr(&newval, "default"); valptr = bond_opt_parse( bond_opt_get(BOND_OPT_AD_ACTOR_SYS_PRIO), &newval); if (!valptr) { pr_err("Error: No ad_actor_sys_prio default value"); return -EINVAL; } ad_actor_sys_prio = valptr->value; valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_USER_PORT_KEY), &newval); if (!valptr) { pr_err("Error: No ad_user_port_key default value"); return -EINVAL; } ad_user_port_key = valptr->value; bond_opt_initstr(&newval, "default"); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), &newval); if (!valptr) { pr_err("Error: No tlb_dynamic_lb default value"); return -EINVAL; } tlb_dynamic_lb = valptr->value; if (lp_interval == 0) { pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n", INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL); lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL; } /* fill params struct with the proper values */ 
params->mode = bond_mode; params->xmit_policy = xmit_hashtype; params->miimon = miimon; params->num_peer_notif = num_peer_notif; params->arp_interval = arp_interval; params->arp_validate = arp_validate_value; params->arp_all_targets = arp_all_targets_value; params->missed_max = 2; params->updelay = updelay; params->downdelay = downdelay; params->peer_notif_delay = 0; params->use_carrier = use_carrier; params->lacp_active = 1; params->lacp_fast = lacp_fast; params->primary[0] = 0; params->primary_reselect = primary_reselect_value; params->fail_over_mac = fail_over_mac_value; params->tx_queues = tx_queues; params->all_slaves_active = all_slaves_active; params->resend_igmp = resend_igmp; params->min_links = min_links; params->lp_interval = lp_interval; params->packets_per_slave = packets_per_slave; params->tlb_dynamic_lb = tlb_dynamic_lb; params->ad_actor_sys_prio = ad_actor_sys_prio; eth_zero_addr(params->ad_actor_system); params->ad_user_port_key = ad_user_port_key; params->coupled_control = 1; if (packets_per_slave > 0) { params->reciprocal_packets_per_slave = reciprocal_value(packets_per_slave); } else { /* reciprocal_packets_per_slave is unused if * packets_per_slave is 0 or 1, just initialize it */ params->reciprocal_packets_per_slave = (struct reciprocal_value) { 0 }; } if (primary) strscpy_pad(params->primary, primary, sizeof(params->primary)); memcpy(params->arp_targets, arp_target, sizeof(arp_target)); #if IS_ENABLED(CONFIG_IPV6) memset(params->ns_targets, 0, sizeof(struct in6_addr) * BOND_MAX_NS_TARGETS); #endif return 0; } /* Called from registration process */ static int bond_init(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); netdev_dbg(bond_dev, "Begin bond_init\n"); bond->wq = alloc_ordered_workqueue(bond_dev->name, WQ_MEM_RECLAIM); if (!bond->wq) return -ENOMEM; bond->notifier_ctx = false; spin_lock_init(&bond->stats_lock); 
netdev_lockdep_set_classes(bond_dev); list_add_tail_rcu(&bond->bond_list, &bn->dev_list); bond_prepare_sysfs_group(bond); bond_debug_register(bond); /* Ensure valid dev_addr */ if (is_zero_ether_addr(bond_dev->dev_addr) && bond_dev->addr_assign_type == NET_ADDR_PERM) eth_hw_addr_random(bond_dev); return 0; } unsigned int bond_get_num_tx_queues(void) { return tx_queues; } /* Create a new bond based on the specified name and bonding parameters. * If name is NULL, obtain a suitable "bond%d" name for us. * Caller must NOT hold rtnl_lock; we need to release it here before we * set up our sysfs entries. */ int bond_create(struct net *net, const char *name) { struct net_device *bond_dev; struct bonding *bond; int res = -ENOMEM; rtnl_lock(); bond_dev = alloc_netdev_mq(sizeof(struct bonding), name ? name : "bond%d", NET_NAME_UNKNOWN, bond_setup, tx_queues); if (!bond_dev) goto out; bond = netdev_priv(bond_dev); dev_net_set(bond_dev, net); bond_dev->rtnl_link_ops = &bond_link_ops; res = register_netdevice(bond_dev); if (res < 0) { free_netdev(bond_dev); goto out; } netif_carrier_off(bond_dev); bond_work_init_all(bond); out: rtnl_unlock(); return res; } static int __net_init bond_net_init(struct net *net) { struct bond_net *bn = net_generic(net, bond_net_id); bn->net = net; INIT_LIST_HEAD(&bn->dev_list); bond_create_proc_dir(bn); bond_create_sysfs(bn); return 0; } /* According to commit 69b0216ac255 ("bonding: fix bonding_masters * race condition in bond unloading") we need to remove sysfs files * before we remove our devices (done later in bond_net_exit_batch_rtnl()) */ static void __net_exit bond_net_pre_exit(struct net *net) { struct bond_net *bn = net_generic(net, bond_net_id); bond_destroy_sysfs(bn); } static void __net_exit bond_net_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_kill_list) { struct bond_net *bn; struct net *net; /* Kill off any bonds created after unregistering bond rtnl ops */ list_for_each_entry(net, net_list, exit_list) { struct 
bonding *bond, *tmp_bond; bn = net_generic(net, bond_net_id); list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list) unregister_netdevice_queue(bond->dev, dev_kill_list); } } /* According to commit 23fa5c2caae0 ("bonding: destroy proc directory * only after all bonds are gone") bond_destroy_proc_dir() is called * after bond_net_exit_batch_rtnl() has completed. */ static void __net_exit bond_net_exit_batch(struct list_head *net_list) { struct bond_net *bn; struct net *net; list_for_each_entry(net, net_list, exit_list) { bn = net_generic(net, bond_net_id); bond_destroy_proc_dir(bn); } } static struct pernet_operations bond_net_ops = { .init = bond_net_init, .pre_exit = bond_net_pre_exit, .exit_batch_rtnl = bond_net_exit_batch_rtnl, .exit_batch = bond_net_exit_batch, .id = &bond_net_id, .size = sizeof(struct bond_net), }; static int __init bonding_init(void) { int i; int res; res = bond_check_params(&bonding_defaults); if (res) goto out; bond_create_debugfs(); res = register_pernet_subsys(&bond_net_ops); if (res) goto err_net_ops; res = bond_netlink_init(); if (res) goto err_link; for (i = 0; i < max_bonds; i++) { res = bond_create(&init_net, NULL); if (res) goto err; } skb_flow_dissector_init(&flow_keys_bonding, flow_keys_bonding_keys, ARRAY_SIZE(flow_keys_bonding_keys)); register_netdevice_notifier(&bond_netdev_notifier); out: return res; err: bond_netlink_fini(); err_link: unregister_pernet_subsys(&bond_net_ops); err_net_ops: bond_destroy_debugfs(); goto out; } static void __exit bonding_exit(void) { unregister_netdevice_notifier(&bond_netdev_notifier); bond_netlink_fini(); unregister_pernet_subsys(&bond_net_ops); bond_destroy_debugfs(); #ifdef CONFIG_NET_POLL_CONTROLLER /* Make sure we don't have an imbalance on our netpoll blocking */ WARN_ON(atomic_read(&netpoll_block_tx)); #endif } module_init(bonding_init); module_exit(bonding_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR("Thomas Davis, tadavis@lbl.gov and many 
others"); |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 | /* * Copyright (c) 2018 Cumulus Networks. All rights reserved. * Copyright (c) 2018 David Ahern <dsa@cumulusnetworks.com> * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 
/* Lookup key identifying one simulated route.
 *
 * The whole structure is used as the hash key (nsim_fib_rt_ht_params sets
 * key_len to sizeof(struct nsim_fib_rt_key)), so every byte of it, including
 * the unused tail of @addr, has to be in a known state before hashing:
 * nsim_fib_rt_lookup() zeroes its on-stack key first, and
 * nsim_fib4_rt_create() allocates the containing object with kzalloc().
 */
struct nsim_fib_rt_key {
	/* Destination prefix; sized for IPv6, IPv4 uses the first 4 bytes */
	unsigned char addr[sizeof(struct in6_addr)];
	unsigned char prefix_len;
	/* Address family (the IPv4 helpers pass AF_INET) */
	int family;
	/* Routing table id */
	u32 tb_id;
};
fib_entry_notifier_info fen_info; struct nsim_fib6_event fib6_event; }; struct nsim_fib_data *data; unsigned long event; int family; }; static const struct rhashtable_params nsim_fib_rt_ht_params = { .key_offset = offsetof(struct nsim_fib_rt, key), .head_offset = offsetof(struct nsim_fib_rt, ht_node), .key_len = sizeof(struct nsim_fib_rt_key), .automatic_shrinking = true, }; struct nsim_nexthop { struct rhash_head ht_node; u64 occ; u32 id; bool is_resilient; }; static const struct rhashtable_params nsim_nexthop_ht_params = { .key_offset = offsetof(struct nsim_nexthop, id), .head_offset = offsetof(struct nsim_nexthop, ht_node), .key_len = sizeof(u32), .automatic_shrinking = true, }; u64 nsim_fib_get_val(struct nsim_fib_data *fib_data, enum nsim_resource_id res_id, bool max) { struct nsim_fib_entry *entry; switch (res_id) { case NSIM_RESOURCE_IPV4_FIB: entry = &fib_data->ipv4.fib; break; case NSIM_RESOURCE_IPV4_FIB_RULES: entry = &fib_data->ipv4.rules; break; case NSIM_RESOURCE_IPV6_FIB: entry = &fib_data->ipv6.fib; break; case NSIM_RESOURCE_IPV6_FIB_RULES: entry = &fib_data->ipv6.rules; break; case NSIM_RESOURCE_NEXTHOPS: entry = &fib_data->nexthops; break; default: return 0; } return max ? 
entry->max : atomic64_read(&entry->num); } static void nsim_fib_set_max(struct nsim_fib_data *fib_data, enum nsim_resource_id res_id, u64 val) { struct nsim_fib_entry *entry; switch (res_id) { case NSIM_RESOURCE_IPV4_FIB: entry = &fib_data->ipv4.fib; break; case NSIM_RESOURCE_IPV4_FIB_RULES: entry = &fib_data->ipv4.rules; break; case NSIM_RESOURCE_IPV6_FIB: entry = &fib_data->ipv6.fib; break; case NSIM_RESOURCE_IPV6_FIB_RULES: entry = &fib_data->ipv6.rules; break; case NSIM_RESOURCE_NEXTHOPS: entry = &fib_data->nexthops; break; default: WARN_ON(1); return; } entry->max = val; } static int nsim_fib_rule_account(struct nsim_fib_entry *entry, bool add, struct netlink_ext_ack *extack) { int err = 0; if (add) { if (!atomic64_add_unless(&entry->num, 1, entry->max)) { err = -ENOSPC; NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported fib rule entries"); } } else { atomic64_dec_if_positive(&entry->num); } return err; } static int nsim_fib_rule_event(struct nsim_fib_data *data, struct fib_notifier_info *info, bool add) { struct netlink_ext_ack *extack = info->extack; int err = 0; switch (info->family) { case AF_INET: err = nsim_fib_rule_account(&data->ipv4.rules, add, extack); break; case AF_INET6: err = nsim_fib_rule_account(&data->ipv6.rules, add, extack); break; } return err; } static int nsim_fib_account(struct nsim_fib_entry *entry, bool add) { int err = 0; if (add) { if (!atomic64_add_unless(&entry->num, 1, entry->max)) err = -ENOSPC; } else { atomic64_dec_if_positive(&entry->num); } return err; } static void nsim_fib_rt_init(struct nsim_fib_data *data, struct nsim_fib_rt *fib_rt, const void *addr, size_t addr_len, unsigned int prefix_len, int family, u32 tb_id) { memcpy(fib_rt->key.addr, addr, addr_len); fib_rt->key.prefix_len = prefix_len; fib_rt->key.family = family; fib_rt->key.tb_id = tb_id; list_add(&fib_rt->list, &data->fib_rt_list); } static void nsim_fib_rt_fini(struct nsim_fib_rt *fib_rt) { list_del(&fib_rt->list); } static struct nsim_fib_rt 
*nsim_fib_rt_lookup(struct rhashtable *fib_rt_ht, const void *addr, size_t addr_len, unsigned int prefix_len, int family, u32 tb_id) { struct nsim_fib_rt_key key; memset(&key, 0, sizeof(key)); memcpy(key.addr, addr, addr_len); key.prefix_len = prefix_len; key.family = family; key.tb_id = tb_id; return rhashtable_lookup_fast(fib_rt_ht, &key, nsim_fib_rt_ht_params); } static struct nsim_fib4_rt * nsim_fib4_rt_create(struct nsim_fib_data *data, struct fib_entry_notifier_info *fen_info) { struct nsim_fib4_rt *fib4_rt; fib4_rt = kzalloc(sizeof(*fib4_rt), GFP_KERNEL); if (!fib4_rt) return NULL; nsim_fib_rt_init(data, &fib4_rt->common, &fen_info->dst, sizeof(u32), fen_info->dst_len, AF_INET, fen_info->tb_id); fib4_rt->fi = fen_info->fi; fib_info_hold(fib4_rt->fi); fib4_rt->dscp = fen_info->dscp; fib4_rt->type = fen_info->type; return fib4_rt; } static void nsim_fib4_rt_destroy(struct nsim_fib4_rt *fib4_rt) { fib_info_put(fib4_rt->fi); nsim_fib_rt_fini(&fib4_rt->common); kfree(fib4_rt); } static struct nsim_fib4_rt * nsim_fib4_rt_lookup(struct rhashtable *fib_rt_ht, const struct fib_entry_notifier_info *fen_info) { struct nsim_fib_rt *fib_rt; fib_rt = nsim_fib_rt_lookup(fib_rt_ht, &fen_info->dst, sizeof(u32), fen_info->dst_len, AF_INET, fen_info->tb_id); if (!fib_rt) return NULL; return container_of(fib_rt, struct nsim_fib4_rt, common); } static void nsim_fib4_rt_offload_failed_flag_set(struct net *net, struct fib_entry_notifier_info *fen_info) { u32 *p_dst = (u32 *)&fen_info->dst; struct fib_rt_info fri; fri.fi = fen_info->fi; fri.tb_id = fen_info->tb_id; fri.dst = cpu_to_be32(*p_dst); fri.dst_len = fen_info->dst_len; fri.dscp = fen_info->dscp; fri.type = fen_info->type; fri.offload = false; fri.trap = false; fri.offload_failed = true; fib_alias_hw_flags_set(net, &fri); } static void nsim_fib4_rt_hw_flags_set(struct net *net, const struct nsim_fib4_rt *fib4_rt, bool trap) { u32 *p_dst = (u32 *) fib4_rt->common.key.addr; int dst_len = fib4_rt->common.key.prefix_len; struct 
fib_rt_info fri; fri.fi = fib4_rt->fi; fri.tb_id = fib4_rt->common.key.tb_id; fri.dst = cpu_to_be32(*p_dst); fri.dst_len = dst_len; fri.dscp = fib4_rt->dscp; fri.type = fib4_rt->type; fri.offload = false; fri.trap = trap; fri.offload_failed = false; fib_alias_hw_flags_set(net, &fri); } static int nsim_fib4_rt_add(struct nsim_fib_data *data, struct nsim_fib4_rt *fib4_rt) { struct net *net = devlink_net(data->devlink); int err; err = rhashtable_insert_fast(&data->fib_rt_ht, &fib4_rt->common.ht_node, nsim_fib_rt_ht_params); if (err) goto err_fib_dismiss; /* Simulate hardware programming latency. */ msleep(1); nsim_fib4_rt_hw_flags_set(net, fib4_rt, true); return 0; err_fib_dismiss: /* Drop the accounting that was increased from the notification * context when FIB_EVENT_ENTRY_REPLACE was triggered. */ nsim_fib_account(&data->ipv4.fib, false); return err; } static int nsim_fib4_rt_replace(struct nsim_fib_data *data, struct nsim_fib4_rt *fib4_rt, struct nsim_fib4_rt *fib4_rt_old) { struct net *net = devlink_net(data->devlink); int err; /* We are replacing a route, so need to remove the accounting which * was increased when FIB_EVENT_ENTRY_REPLACE was triggered. */ err = nsim_fib_account(&data->ipv4.fib, false); if (err) return err; err = rhashtable_replace_fast(&data->fib_rt_ht, &fib4_rt_old->common.ht_node, &fib4_rt->common.ht_node, nsim_fib_rt_ht_params); if (err) return err; msleep(1); nsim_fib4_rt_hw_flags_set(net, fib4_rt, true); nsim_fib4_rt_hw_flags_set(net, fib4_rt_old, false); nsim_fib4_rt_destroy(fib4_rt_old); return 0; } static int nsim_fib4_rt_insert(struct nsim_fib_data *data, struct fib_entry_notifier_info *fen_info) { struct nsim_fib4_rt *fib4_rt, *fib4_rt_old; int err; if (data->fail_route_offload) { /* For testing purposes, user set debugfs fail_route_offload * value to true. Simulate hardware programming latency and then * fail. 
*/ msleep(1); return -EINVAL; } fib4_rt = nsim_fib4_rt_create(data, fen_info); if (!fib4_rt) return -ENOMEM; fib4_rt_old = nsim_fib4_rt_lookup(&data->fib_rt_ht, fen_info); if (!fib4_rt_old) err = nsim_fib4_rt_add(data, fib4_rt); else err = nsim_fib4_rt_replace(data, fib4_rt, fib4_rt_old); if (err) nsim_fib4_rt_destroy(fib4_rt); return err; } static void nsim_fib4_rt_remove(struct nsim_fib_data *data, const struct fib_entry_notifier_info *fen_info) { struct nsim_fib4_rt *fib4_rt; fib4_rt = nsim_fib4_rt_lookup(&data->fib_rt_ht, fen_info); if (!fib4_rt) return; rhashtable_remove_fast(&data->fib_rt_ht, &fib4_rt->common.ht_node, nsim_fib_rt_ht_params); nsim_fib4_rt_destroy(fib4_rt); } static int nsim_fib4_event(struct nsim_fib_data *data, struct fib_entry_notifier_info *fen_info, unsigned long event) { int err = 0; switch (event) { case FIB_EVENT_ENTRY_REPLACE: err = nsim_fib4_rt_insert(data, fen_info); if (err) { struct net *net = devlink_net(data->devlink); nsim_fib4_rt_offload_failed_flag_set(net, fen_info); } break; case FIB_EVENT_ENTRY_DEL: nsim_fib4_rt_remove(data, fen_info); break; default: break; } return err; } static struct nsim_fib6_rt_nh * nsim_fib6_rt_nh_find(const struct nsim_fib6_rt *fib6_rt, const struct fib6_info *rt) { struct nsim_fib6_rt_nh *fib6_rt_nh; list_for_each_entry(fib6_rt_nh, &fib6_rt->nh_list, list) { if (fib6_rt_nh->rt == rt) return fib6_rt_nh; } return NULL; } static int nsim_fib6_rt_nh_add(struct nsim_fib6_rt *fib6_rt, struct fib6_info *rt) { struct nsim_fib6_rt_nh *fib6_rt_nh; fib6_rt_nh = kzalloc(sizeof(*fib6_rt_nh), GFP_KERNEL); if (!fib6_rt_nh) return -ENOMEM; fib6_info_hold(rt); fib6_rt_nh->rt = rt; list_add_tail(&fib6_rt_nh->list, &fib6_rt->nh_list); fib6_rt->nhs++; return 0; } #if IS_ENABLED(CONFIG_IPV6) static void nsim_rt6_release(struct fib6_info *rt) { fib6_info_release(rt); } #else static void nsim_rt6_release(struct fib6_info *rt) { } #endif static void nsim_fib6_rt_nh_del(struct nsim_fib6_rt *fib6_rt, const struct fib6_info 
*rt) { struct nsim_fib6_rt_nh *fib6_rt_nh; fib6_rt_nh = nsim_fib6_rt_nh_find(fib6_rt, rt); if (!fib6_rt_nh) return; fib6_rt->nhs--; list_del(&fib6_rt_nh->list); nsim_rt6_release(fib6_rt_nh->rt); kfree(fib6_rt_nh); } static struct nsim_fib6_rt * nsim_fib6_rt_create(struct nsim_fib_data *data, struct fib6_info **rt_arr, unsigned int nrt6) { struct fib6_info *rt = rt_arr[0]; struct nsim_fib6_rt *fib6_rt; int i = 0; int err; fib6_rt = kzalloc(sizeof(*fib6_rt), GFP_KERNEL); if (!fib6_rt) return ERR_PTR(-ENOMEM); nsim_fib_rt_init(data, &fib6_rt->common, &rt->fib6_dst.addr, sizeof(rt->fib6_dst.addr), rt->fib6_dst.plen, AF_INET6, rt->fib6_table->tb6_id); /* We consider a multipath IPv6 route as one entry, but it can be made * up from several fib6_info structs (one for each nexthop), so we * add them all to the same list under the entry. */ INIT_LIST_HEAD(&fib6_rt->nh_list); for (i = 0; i < nrt6; i++) { err = nsim_fib6_rt_nh_add(fib6_rt, rt_arr[i]); if (err) goto err_fib6_rt_nh_del; } return fib6_rt; err_fib6_rt_nh_del: for (i--; i >= 0; i--) { nsim_fib6_rt_nh_del(fib6_rt, rt_arr[i]); } nsim_fib_rt_fini(&fib6_rt->common); kfree(fib6_rt); return ERR_PTR(err); } static void nsim_fib6_rt_destroy(struct nsim_fib6_rt *fib6_rt) { struct nsim_fib6_rt_nh *iter, *tmp; list_for_each_entry_safe(iter, tmp, &fib6_rt->nh_list, list) nsim_fib6_rt_nh_del(fib6_rt, iter->rt); WARN_ON_ONCE(!list_empty(&fib6_rt->nh_list)); nsim_fib_rt_fini(&fib6_rt->common); kfree(fib6_rt); } static struct nsim_fib6_rt * nsim_fib6_rt_lookup(struct rhashtable *fib_rt_ht, const struct fib6_info *rt) { struct nsim_fib_rt *fib_rt; fib_rt = nsim_fib_rt_lookup(fib_rt_ht, &rt->fib6_dst.addr, sizeof(rt->fib6_dst.addr), rt->fib6_dst.plen, AF_INET6, rt->fib6_table->tb6_id); if (!fib_rt) return NULL; return container_of(fib_rt, struct nsim_fib6_rt, common); } static int nsim_fib6_rt_append(struct nsim_fib_data *data, struct nsim_fib6_event *fib6_event) { struct fib6_info *rt = fib6_event->rt_arr[0]; struct nsim_fib6_rt 
*fib6_rt; int i, err; if (data->fail_route_offload) { /* For testing purposes, user set debugfs fail_route_offload * value to true. Simulate hardware programming latency and then * fail. */ msleep(1); return -EINVAL; } fib6_rt = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt); if (!fib6_rt) return -EINVAL; for (i = 0; i < fib6_event->nrt6; i++) { err = nsim_fib6_rt_nh_add(fib6_rt, fib6_event->rt_arr[i]); if (err) goto err_fib6_rt_nh_del; WRITE_ONCE(fib6_event->rt_arr[i]->trap, true); } return 0; err_fib6_rt_nh_del: for (i--; i >= 0; i--) { WRITE_ONCE(fib6_event->rt_arr[i]->trap, false); nsim_fib6_rt_nh_del(fib6_rt, fib6_event->rt_arr[i]); } return err; } #if IS_ENABLED(CONFIG_IPV6) static void nsim_fib6_rt_offload_failed_flag_set(struct nsim_fib_data *data, struct fib6_info **rt_arr, unsigned int nrt6) { struct net *net = devlink_net(data->devlink); int i; for (i = 0; i < nrt6; i++) fib6_info_hw_flags_set(net, rt_arr[i], false, false, true); } #else static void nsim_fib6_rt_offload_failed_flag_set(struct nsim_fib_data *data, struct fib6_info **rt_arr, unsigned int nrt6) { } #endif #if IS_ENABLED(CONFIG_IPV6) static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data, const struct nsim_fib6_rt *fib6_rt, bool trap) { struct net *net = devlink_net(data->devlink); struct nsim_fib6_rt_nh *fib6_rt_nh; list_for_each_entry(fib6_rt_nh, &fib6_rt->nh_list, list) fib6_info_hw_flags_set(net, fib6_rt_nh->rt, false, trap, false); } #else static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data, const struct nsim_fib6_rt *fib6_rt, bool trap) { } #endif static int nsim_fib6_rt_add(struct nsim_fib_data *data, struct nsim_fib6_rt *fib6_rt) { int err; err = rhashtable_insert_fast(&data->fib_rt_ht, &fib6_rt->common.ht_node, nsim_fib_rt_ht_params); if (err) goto err_fib_dismiss; msleep(1); nsim_fib6_rt_hw_flags_set(data, fib6_rt, true); return 0; err_fib_dismiss: /* Drop the accounting that was increased from the notification * context when FIB_EVENT_ENTRY_REPLACE was triggered. 
*/ nsim_fib_account(&data->ipv6.fib, false); return err; } static int nsim_fib6_rt_replace(struct nsim_fib_data *data, struct nsim_fib6_rt *fib6_rt, struct nsim_fib6_rt *fib6_rt_old) { int err; /* We are replacing a route, so need to remove the accounting which * was increased when FIB_EVENT_ENTRY_REPLACE was triggered. */ err = nsim_fib_account(&data->ipv6.fib, false); if (err) return err; err = rhashtable_replace_fast(&data->fib_rt_ht, &fib6_rt_old->common.ht_node, &fib6_rt->common.ht_node, nsim_fib_rt_ht_params); if (err) return err; msleep(1); nsim_fib6_rt_hw_flags_set(data, fib6_rt, true); nsim_fib6_rt_hw_flags_set(data, fib6_rt_old, false); nsim_fib6_rt_destroy(fib6_rt_old); return 0; } static int nsim_fib6_rt_insert(struct nsim_fib_data *data, struct nsim_fib6_event *fib6_event) { struct fib6_info *rt = fib6_event->rt_arr[0]; struct nsim_fib6_rt *fib6_rt, *fib6_rt_old; int err; if (data->fail_route_offload) { /* For testing purposes, user set debugfs fail_route_offload * value to true. Simulate hardware programming latency and then * fail. */ msleep(1); return -EINVAL; } fib6_rt = nsim_fib6_rt_create(data, fib6_event->rt_arr, fib6_event->nrt6); if (IS_ERR(fib6_rt)) return PTR_ERR(fib6_rt); fib6_rt_old = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt); if (!fib6_rt_old) err = nsim_fib6_rt_add(data, fib6_rt); else err = nsim_fib6_rt_replace(data, fib6_rt, fib6_rt_old); if (err) nsim_fib6_rt_destroy(fib6_rt); return err; } static void nsim_fib6_rt_remove(struct nsim_fib_data *data, struct nsim_fib6_event *fib6_event) { struct fib6_info *rt = fib6_event->rt_arr[0]; struct nsim_fib6_rt *fib6_rt; int i; /* Multipath routes are first added to the FIB trie and only then * notified. If we vetoed the addition, we will get a delete * notification for a route we do not have. Therefore, do not warn if * route was not found. */ fib6_rt = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt); if (!fib6_rt) return; /* If not all the nexthops are deleted, then only reduce the nexthop * group. 
*/ if (fib6_event->nrt6 != fib6_rt->nhs) { for (i = 0; i < fib6_event->nrt6; i++) nsim_fib6_rt_nh_del(fib6_rt, fib6_event->rt_arr[i]); return; } rhashtable_remove_fast(&data->fib_rt_ht, &fib6_rt->common.ht_node, nsim_fib_rt_ht_params); nsim_fib6_rt_destroy(fib6_rt); } static int nsim_fib6_event_init(struct nsim_fib6_event *fib6_event, struct fib6_entry_notifier_info *fen6_info) { struct fib6_info *rt = fen6_info->rt; struct fib6_info **rt_arr; struct fib6_info *iter; unsigned int nrt6; int i = 0; nrt6 = fen6_info->nsiblings + 1; rt_arr = kcalloc(nrt6, sizeof(struct fib6_info *), GFP_ATOMIC); if (!rt_arr) return -ENOMEM; fib6_event->rt_arr = rt_arr; fib6_event->nrt6 = nrt6; rt_arr[0] = rt; fib6_info_hold(rt); if (!fen6_info->nsiblings) return 0; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (i == fen6_info->nsiblings) break; rt_arr[i + 1] = iter; fib6_info_hold(iter); i++; } WARN_ON_ONCE(i != fen6_info->nsiblings); return 0; } static void nsim_fib6_event_fini(struct nsim_fib6_event *fib6_event) { int i; for (i = 0; i < fib6_event->nrt6; i++) nsim_rt6_release(fib6_event->rt_arr[i]); kfree(fib6_event->rt_arr); } static int nsim_fib6_event(struct nsim_fib_data *data, struct nsim_fib6_event *fib6_event, unsigned long event) { int err; if (fib6_event->rt_arr[0]->fib6_src.plen) return 0; switch (event) { case FIB_EVENT_ENTRY_REPLACE: err = nsim_fib6_rt_insert(data, fib6_event); if (err) goto err_rt_offload_failed_flag_set; break; case FIB_EVENT_ENTRY_APPEND: err = nsim_fib6_rt_append(data, fib6_event); if (err) goto err_rt_offload_failed_flag_set; break; case FIB_EVENT_ENTRY_DEL: nsim_fib6_rt_remove(data, fib6_event); break; default: break; } return 0; err_rt_offload_failed_flag_set: nsim_fib6_rt_offload_failed_flag_set(data, fib6_event->rt_arr, fib6_event->nrt6); return err; } static void nsim_fib_event(struct nsim_fib_event *fib_event) { switch (fib_event->family) { case AF_INET: nsim_fib4_event(fib_event->data, &fib_event->fen_info, 
fib_event->event); fib_info_put(fib_event->fen_info.fi); break; case AF_INET6: nsim_fib6_event(fib_event->data, &fib_event->fib6_event, fib_event->event); nsim_fib6_event_fini(&fib_event->fib6_event); break; } } static int nsim_fib4_prepare_event(struct fib_notifier_info *info, struct nsim_fib_event *fib_event, unsigned long event) { struct nsim_fib_data *data = fib_event->data; struct fib_entry_notifier_info *fen_info; struct netlink_ext_ack *extack; int err = 0; fen_info = container_of(info, struct fib_entry_notifier_info, info); fib_event->fen_info = *fen_info; extack = info->extack; switch (event) { case FIB_EVENT_ENTRY_REPLACE: err = nsim_fib_account(&data->ipv4.fib, true); if (err) { NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported fib entries"); return err; } break; case FIB_EVENT_ENTRY_DEL: if (data->fail_route_delete) { NL_SET_ERR_MSG_MOD(extack, "Failed to process route deletion"); return -EINVAL; } nsim_fib_account(&data->ipv4.fib, false); break; } /* Take reference on fib_info to prevent it from being * freed while event is queued. Release it afterwards. 
*/ fib_info_hold(fib_event->fen_info.fi); return 0; } static int nsim_fib6_prepare_event(struct fib_notifier_info *info, struct nsim_fib_event *fib_event, unsigned long event) { struct nsim_fib_data *data = fib_event->data; struct fib6_entry_notifier_info *fen6_info; struct netlink_ext_ack *extack; int err = 0; fen6_info = container_of(info, struct fib6_entry_notifier_info, info); err = nsim_fib6_event_init(&fib_event->fib6_event, fen6_info); if (err) return err; extack = info->extack; switch (event) { case FIB_EVENT_ENTRY_REPLACE: err = nsim_fib_account(&data->ipv6.fib, true); if (err) { NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported fib entries"); goto err_fib6_event_fini; } break; case FIB_EVENT_ENTRY_DEL: if (data->fail_route_delete) { err = -EINVAL; NL_SET_ERR_MSG_MOD(extack, "Failed to process route deletion"); goto err_fib6_event_fini; } nsim_fib_account(&data->ipv6.fib, false); break; } return 0; err_fib6_event_fini: nsim_fib6_event_fini(&fib_event->fib6_event); return err; } static int nsim_fib_event_schedule_work(struct nsim_fib_data *data, struct fib_notifier_info *info, unsigned long event) { struct nsim_fib_event *fib_event; int err; if (info->family != AF_INET && info->family != AF_INET6) /* netdevsim does not support 'RTNL_FAMILY_IP6MR' and * 'RTNL_FAMILY_IPMR' and should ignore them. 
*/ return NOTIFY_DONE; fib_event = kzalloc(sizeof(*fib_event), GFP_ATOMIC); if (!fib_event) goto err_fib_event_alloc; fib_event->data = data; fib_event->event = event; fib_event->family = info->family; switch (info->family) { case AF_INET: err = nsim_fib4_prepare_event(info, fib_event, event); break; case AF_INET6: err = nsim_fib6_prepare_event(info, fib_event, event); break; } if (err) goto err_fib_prepare_event; /* Enqueue the event and trigger the work */ spin_lock_bh(&data->fib_event_queue_lock); list_add_tail(&fib_event->list, &data->fib_event_queue); spin_unlock_bh(&data->fib_event_queue_lock); schedule_work(&data->fib_event_work); return NOTIFY_DONE; err_fib_prepare_event: kfree(fib_event); err_fib_event_alloc: if (event == FIB_EVENT_ENTRY_DEL) schedule_work(&data->fib_flush_work); return NOTIFY_BAD; } static int nsim_fib_event_nb(struct notifier_block *nb, unsigned long event, void *ptr) { struct nsim_fib_data *data = container_of(nb, struct nsim_fib_data, fib_nb); struct fib_notifier_info *info = ptr; int err; switch (event) { case FIB_EVENT_RULE_ADD: case FIB_EVENT_RULE_DEL: err = nsim_fib_rule_event(data, info, event == FIB_EVENT_RULE_ADD); return notifier_from_errno(err); case FIB_EVENT_ENTRY_REPLACE: case FIB_EVENT_ENTRY_APPEND: case FIB_EVENT_ENTRY_DEL: return nsim_fib_event_schedule_work(data, info, event); } return NOTIFY_DONE; } static void nsim_fib4_rt_free(struct nsim_fib_rt *fib_rt, struct nsim_fib_data *data) { struct devlink *devlink = data->devlink; struct nsim_fib4_rt *fib4_rt; fib4_rt = container_of(fib_rt, struct nsim_fib4_rt, common); nsim_fib4_rt_hw_flags_set(devlink_net(devlink), fib4_rt, false); nsim_fib_account(&data->ipv4.fib, false); nsim_fib4_rt_destroy(fib4_rt); } static void nsim_fib6_rt_free(struct nsim_fib_rt *fib_rt, struct nsim_fib_data *data) { struct nsim_fib6_rt *fib6_rt; fib6_rt = container_of(fib_rt, struct nsim_fib6_rt, common); nsim_fib6_rt_hw_flags_set(data, fib6_rt, false); nsim_fib_account(&data->ipv6.fib, false); 
nsim_fib6_rt_destroy(fib6_rt); } static void nsim_fib_rt_free(void *ptr, void *arg) { struct nsim_fib_rt *fib_rt = ptr; struct nsim_fib_data *data = arg; switch (fib_rt->key.family) { case AF_INET: nsim_fib4_rt_free(fib_rt, data); break; case AF_INET6: nsim_fib6_rt_free(fib_rt, data); break; default: WARN_ON_ONCE(1); } } /* inconsistent dump, trying again */ static void nsim_fib_dump_inconsistent(struct notifier_block *nb) { struct nsim_fib_data *data = container_of(nb, struct nsim_fib_data, fib_nb); struct nsim_fib_rt *fib_rt, *fib_rt_tmp; /* Flush the work to make sure there is no race with notifications. */ flush_work(&data->fib_event_work); /* The notifier block is still not registered, so we do not need to * take any locks here. */ list_for_each_entry_safe(fib_rt, fib_rt_tmp, &data->fib_rt_list, list) { rhashtable_remove_fast(&data->fib_rt_ht, &fib_rt->ht_node, nsim_fib_rt_ht_params); nsim_fib_rt_free(fib_rt, data); } atomic64_set(&data->ipv4.rules.num, 0ULL); atomic64_set(&data->ipv6.rules.num, 0ULL); } static struct nsim_nexthop *nsim_nexthop_create(struct nsim_fib_data *data, struct nh_notifier_info *info) { struct nsim_nexthop *nexthop; u64 occ = 0; int i; nexthop = kzalloc(sizeof(*nexthop), GFP_KERNEL); if (!nexthop) return ERR_PTR(-ENOMEM); nexthop->id = info->id; /* Determine the number of nexthop entries the new nexthop will * occupy. 
*/ switch (info->type) { case NH_NOTIFIER_INFO_TYPE_SINGLE: occ = 1; break; case NH_NOTIFIER_INFO_TYPE_GRP: for (i = 0; i < info->nh_grp->num_nh; i++) occ += info->nh_grp->nh_entries[i].weight; break; case NH_NOTIFIER_INFO_TYPE_RES_TABLE: occ = info->nh_res_table->num_nh_buckets; nexthop->is_resilient = true; break; default: NL_SET_ERR_MSG_MOD(info->extack, "Unsupported nexthop type"); kfree(nexthop); return ERR_PTR(-EOPNOTSUPP); } nexthop->occ = occ; return nexthop; } static void nsim_nexthop_destroy(struct nsim_nexthop *nexthop) { kfree(nexthop); } static int nsim_nexthop_account(struct nsim_fib_data *data, u64 occ, bool add, struct netlink_ext_ack *extack) { int i, err = 0; if (add) { for (i = 0; i < occ; i++) if (!atomic64_add_unless(&data->nexthops.num, 1, data->nexthops.max)) { err = -ENOSPC; NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported nexthops"); goto err_num_decrease; } } else { if (WARN_ON(occ > atomic64_read(&data->nexthops.num))) return -EINVAL; atomic64_sub(occ, &data->nexthops.num); } return err; err_num_decrease: atomic64_sub(i, &data->nexthops.num); return err; } static void nsim_nexthop_hw_flags_set(struct net *net, const struct nsim_nexthop *nexthop, bool trap) { int i; nexthop_set_hw_flags(net, nexthop->id, false, trap); if (!nexthop->is_resilient) return; for (i = 0; i < nexthop->occ; i++) nexthop_bucket_set_hw_flags(net, nexthop->id, i, false, trap); } static int nsim_nexthop_add(struct nsim_fib_data *data, struct nsim_nexthop *nexthop, struct netlink_ext_ack *extack) { struct net *net = devlink_net(data->devlink); int err; err = nsim_nexthop_account(data, nexthop->occ, true, extack); if (err) return err; err = rhashtable_insert_fast(&data->nexthop_ht, &nexthop->ht_node, nsim_nexthop_ht_params); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to insert nexthop"); goto err_nexthop_dismiss; } nsim_nexthop_hw_flags_set(net, nexthop, true); return 0; err_nexthop_dismiss: nsim_nexthop_account(data, nexthop->occ, false, extack); return err; 
} static int nsim_nexthop_replace(struct nsim_fib_data *data, struct nsim_nexthop *nexthop, struct nsim_nexthop *nexthop_old, struct netlink_ext_ack *extack) { struct net *net = devlink_net(data->devlink); int err; err = nsim_nexthop_account(data, nexthop->occ, true, extack); if (err) return err; err = rhashtable_replace_fast(&data->nexthop_ht, &nexthop_old->ht_node, &nexthop->ht_node, nsim_nexthop_ht_params); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to replace nexthop"); goto err_nexthop_dismiss; } nsim_nexthop_hw_flags_set(net, nexthop, true); nsim_nexthop_account(data, nexthop_old->occ, false, extack); nsim_nexthop_destroy(nexthop_old); return 0; err_nexthop_dismiss: nsim_nexthop_account(data, nexthop->occ, false, extack); return err; } static int nsim_nexthop_insert(struct nsim_fib_data *data, struct nh_notifier_info *info) { struct nsim_nexthop *nexthop, *nexthop_old; int err; nexthop = nsim_nexthop_create(data, info); if (IS_ERR(nexthop)) return PTR_ERR(nexthop); nexthop_old = rhashtable_lookup_fast(&data->nexthop_ht, &info->id, nsim_nexthop_ht_params); if (!nexthop_old) err = nsim_nexthop_add(data, nexthop, info->extack); else err = nsim_nexthop_replace(data, nexthop, nexthop_old, info->extack); if (err) nsim_nexthop_destroy(nexthop); return err; } static void nsim_nexthop_remove(struct nsim_fib_data *data, struct nh_notifier_info *info) { struct nsim_nexthop *nexthop; nexthop = rhashtable_lookup_fast(&data->nexthop_ht, &info->id, nsim_nexthop_ht_params); if (!nexthop) return; rhashtable_remove_fast(&data->nexthop_ht, &nexthop->ht_node, nsim_nexthop_ht_params); nsim_nexthop_account(data, nexthop->occ, false, info->extack); nsim_nexthop_destroy(nexthop); } static int nsim_nexthop_res_table_pre_replace(struct nsim_fib_data *data, struct nh_notifier_info *info) { if (data->fail_res_nexthop_group_replace) { NL_SET_ERR_MSG_MOD(info->extack, "Failed to replace a resilient nexthop group"); return -EINVAL; } return 0; } static int 
nsim_nexthop_bucket_replace(struct nsim_fib_data *data, struct nh_notifier_info *info) { if (data->fail_nexthop_bucket_replace) { NL_SET_ERR_MSG_MOD(info->extack, "Failed to replace nexthop bucket"); return -EINVAL; } nexthop_bucket_set_hw_flags(info->net, info->id, info->nh_res_bucket->bucket_index, false, true); return 0; } static int nsim_nexthop_event_nb(struct notifier_block *nb, unsigned long event, void *ptr) { struct nsim_fib_data *data = container_of(nb, struct nsim_fib_data, nexthop_nb); struct nh_notifier_info *info = ptr; int err = 0; mutex_lock(&data->nh_lock); switch (event) { case NEXTHOP_EVENT_REPLACE: err = nsim_nexthop_insert(data, info); break; case NEXTHOP_EVENT_DEL: nsim_nexthop_remove(data, info); break; case NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE: err = nsim_nexthop_res_table_pre_replace(data, info); break; case NEXTHOP_EVENT_BUCKET_REPLACE: err = nsim_nexthop_bucket_replace(data, info); break; default: break; } mutex_unlock(&data->nh_lock); return notifier_from_errno(err); } static void nsim_nexthop_free(void *ptr, void *arg) { struct nsim_nexthop *nexthop = ptr; struct nsim_fib_data *data = arg; struct net *net; net = devlink_net(data->devlink); nsim_nexthop_hw_flags_set(net, nexthop, false); nsim_nexthop_account(data, nexthop->occ, false, NULL); nsim_nexthop_destroy(nexthop); } static ssize_t nsim_nexthop_bucket_activity_write(struct file *file, const char __user *user_buf, size_t size, loff_t *ppos) { struct nsim_fib_data *data = file->private_data; struct net *net = devlink_net(data->devlink); struct nsim_nexthop *nexthop; unsigned long *activity; loff_t pos = *ppos; u16 bucket_index; char buf[128]; int err = 0; u32 nhid; if (pos != 0) return -EINVAL; if (size > sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, size)) return -EFAULT; if (sscanf(buf, "%u %hu", &nhid, &bucket_index) != 2) return -EINVAL; rtnl_lock(); nexthop = rhashtable_lookup_fast(&data->nexthop_ht, &nhid, nsim_nexthop_ht_params); if (!nexthop || 
!nexthop->is_resilient || bucket_index >= nexthop->occ) { err = -EINVAL; goto out; } activity = bitmap_zalloc(nexthop->occ, GFP_KERNEL); if (!activity) { err = -ENOMEM; goto out; } bitmap_set(activity, bucket_index, 1); nexthop_res_grp_activity_update(net, nhid, nexthop->occ, activity); bitmap_free(activity); out: rtnl_unlock(); *ppos = size; return err ?: size; } /* File operations for the write-only "nexthop_bucket_activity" debugfs file. */ static const struct file_operations nsim_nexthop_bucket_activity_fops = { .open = simple_open, .write = nsim_nexthop_bucket_activity_write, .llseek = no_llseek, .owner = THIS_MODULE, }; /* devlink occupancy callbacks: each returns nsim_fib_get_val() for one resource ID with the last argument false (presumably "current value, not the maximum" - confirm against nsim_fib_get_val()). */ static u64 nsim_fib_ipv4_resource_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_IPV4_FIB, false); } static u64 nsim_fib_ipv4_rules_res_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_IPV4_FIB_RULES, false); } static u64 nsim_fib_ipv6_resource_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_IPV6_FIB, false); } static u64 nsim_fib_ipv6_rules_res_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_IPV6_FIB_RULES, false); } static u64 nsim_fib_nexthops_res_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_NEXTHOPS, false); } /* Set the maximum of every FIB resource from its devlink resource size; when the size cannot be read the maximum falls back to (u64)-1. */ static void nsim_fib_set_max_all(struct nsim_fib_data *data, struct devlink *devlink) { static const enum nsim_resource_id res_ids[] = { NSIM_RESOURCE_IPV4_FIB, NSIM_RESOURCE_IPV4_FIB_RULES, NSIM_RESOURCE_IPV6_FIB, NSIM_RESOURCE_IPV6_FIB_RULES, NSIM_RESOURCE_NEXTHOPS, }; int i; for (i = 0; i < ARRAY_SIZE(res_ids); i++) { int err; u64 val; err = devl_resource_size_get(devlink, res_ids[i], &val); if (err) val = (u64) -1; nsim_fib_set_max(data, res_ids[i], val); } } /* Work item: move the queued FIB events onto a local list under the queue spinlock, then handle and free each one under fib_lock. */ static void nsim_fib_event_work(struct work_struct *work) { struct nsim_fib_data *data = container_of(work, struct nsim_fib_data, fib_event_work); struct nsim_fib_event *fib_event,
*next_fib_event; LIST_HEAD(fib_event_queue); spin_lock_bh(&data->fib_event_queue_lock); list_splice_init(&data->fib_event_queue, &fib_event_queue); spin_unlock_bh(&data->fib_event_queue_lock); mutex_lock(&data->fib_lock); list_for_each_entry_safe(fib_event, next_fib_event, &fib_event_queue, list) { nsim_fib_event(fib_event); list_del(&fib_event->list); kfree(fib_event); cond_resched(); } mutex_unlock(&data->fib_lock); } /* Work item: after the event work has finished, remove every route on fib_rt_list from the hash table and free it, under fib_lock. */ static void nsim_fib_flush_work(struct work_struct *work) { struct nsim_fib_data *data = container_of(work, struct nsim_fib_data, fib_flush_work); struct nsim_fib_rt *fib_rt, *fib_rt_tmp; /* Process pending work. */ flush_work(&data->fib_event_work); mutex_lock(&data->fib_lock); list_for_each_entry_safe(fib_rt, fib_rt_tmp, &data->fib_rt_list, list) { rhashtable_remove_fast(&data->fib_rt_ht, &fib_rt->ht_node, nsim_fib_rt_ht_params); nsim_fib_rt_free(fib_rt, data); } mutex_unlock(&data->fib_lock); } /* Create the "fib" debugfs directory under the device directory and populate it with the fail_* knobs (all reset to false) and the bucket activity file. Returns 0 or the error from creating the directory. */ static int nsim_fib_debugfs_init(struct nsim_fib_data *data, struct nsim_dev *nsim_dev) { data->ddir = debugfs_create_dir("fib", nsim_dev->ddir); if (IS_ERR(data->ddir)) return PTR_ERR(data->ddir); data->fail_route_offload = false; debugfs_create_bool("fail_route_offload", 0600, data->ddir, &data->fail_route_offload); data->fail_res_nexthop_group_replace = false; debugfs_create_bool("fail_res_nexthop_group_replace", 0600, data->ddir, &data->fail_res_nexthop_group_replace); data->fail_nexthop_bucket_replace = false; debugfs_create_bool("fail_nexthop_bucket_replace", 0600, data->ddir, &data->fail_nexthop_bucket_replace); debugfs_create_file("nexthop_bucket_activity", 0200, data->ddir, data, &nsim_nexthop_bucket_activity_fops); data->fail_route_delete = false; debugfs_create_bool("fail_route_delete", 0600, data->ddir, &data->fail_route_delete); return 0; } static void nsim_fib_debugfs_exit(struct nsim_fib_data *data) { debugfs_remove_recursive(data->ddir); } /* Allocate and initialise the FIB state for @devlink: debugfs, both hash tables, locks and work items, resource maxima, the nexthop and FIB notifiers, and the devlink occupancy callbacks. Returns the new data, or an ERR_PTR() after unwinding whatever was already set up. */ struct nsim_fib_data *nsim_fib_create(struct devlink *devlink, struct netlink_ext_ack *extack) {
struct nsim_fib_data *data; struct nsim_dev *nsim_dev; int err; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); data->devlink = devlink; nsim_dev = devlink_priv(devlink); err = nsim_fib_debugfs_init(data, nsim_dev); if (err) goto err_data_free; mutex_init(&data->nh_lock); err = rhashtable_init(&data->nexthop_ht, &nsim_nexthop_ht_params); if (err) goto err_debugfs_exit; mutex_init(&data->fib_lock); INIT_LIST_HEAD(&data->fib_rt_list); err = rhashtable_init(&data->fib_rt_ht, &nsim_fib_rt_ht_params); if (err) goto err_rhashtable_nexthop_destroy; INIT_WORK(&data->fib_event_work, nsim_fib_event_work); INIT_WORK(&data->fib_flush_work, nsim_fib_flush_work); INIT_LIST_HEAD(&data->fib_event_queue); spin_lock_init(&data->fib_event_queue_lock); nsim_fib_set_max_all(data, devlink); data->nexthop_nb.notifier_call = nsim_nexthop_event_nb; err = register_nexthop_notifier(devlink_net(devlink), &data->nexthop_nb, extack); if (err) { pr_err("Failed to register nexthop notifier\n"); goto err_rhashtable_fib_destroy; } data->fib_nb.notifier_call = nsim_fib_event_nb; err = register_fib_notifier(devlink_net(devlink), &data->fib_nb, nsim_fib_dump_inconsistent, extack); if (err) { pr_err("Failed to register fib notifier\n"); goto err_nexthop_nb_unregister; } devl_resource_occ_get_register(devlink, NSIM_RESOURCE_IPV4_FIB, nsim_fib_ipv4_resource_occ_get, data); devl_resource_occ_get_register(devlink, NSIM_RESOURCE_IPV4_FIB_RULES, nsim_fib_ipv4_rules_res_occ_get, data); devl_resource_occ_get_register(devlink, NSIM_RESOURCE_IPV6_FIB, nsim_fib_ipv6_resource_occ_get, data); devl_resource_occ_get_register(devlink, NSIM_RESOURCE_IPV6_FIB_RULES, nsim_fib_ipv6_rules_res_occ_get, data); devl_resource_occ_get_register(devlink, NSIM_RESOURCE_NEXTHOPS, nsim_fib_nexthops_res_occ_get, data); return data; err_nexthop_nb_unregister: unregister_nexthop_notifier(devlink_net(devlink), &data->nexthop_nb); err_rhashtable_fib_destroy: cancel_work_sync(&data->fib_flush_work);
flush_work(&data->fib_event_work); rhashtable_free_and_destroy(&data->fib_rt_ht, nsim_fib_rt_free, data); err_rhashtable_nexthop_destroy: rhashtable_free_and_destroy(&data->nexthop_ht, nsim_nexthop_free, data); mutex_destroy(&data->fib_lock); err_debugfs_exit: mutex_destroy(&data->nh_lock); nsim_fib_debugfs_exit(data); err_data_free: kfree(data); return ERR_PTR(err); } /* Undo nsim_fib_create(): unregister the occupancy callbacks and both notifiers, drain the work items, destroy both hash tables (warning if the event queue or route list is not empty), then release the locks, debugfs directory and @data. */ void nsim_fib_destroy(struct devlink *devlink, struct nsim_fib_data *data) { devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_NEXTHOPS); devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV6_FIB_RULES); devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV6_FIB); devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV4_FIB_RULES); devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV4_FIB); unregister_fib_notifier(devlink_net(devlink), &data->fib_nb); unregister_nexthop_notifier(devlink_net(devlink), &data->nexthop_nb); cancel_work_sync(&data->fib_flush_work); flush_work(&data->fib_event_work); rhashtable_free_and_destroy(&data->fib_rt_ht, nsim_fib_rt_free, data); rhashtable_free_and_destroy(&data->nexthop_ht, nsim_nexthop_free, data); WARN_ON_ONCE(!list_empty(&data->fib_event_queue)); WARN_ON_ONCE(!list_empty(&data->fib_rt_list)); mutex_destroy(&data->fib_lock); mutex_destroy(&data->nh_lock); nsim_fib_debugfs_exit(data); kfree(data); } |
| 132 132 132 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> * Copyright (C) 2002 Andi Kleen * * This handles calls from both 32bit and 64bit mode. * * Lock order: * context.ldt_usr_sem * mmap_lock * context.lock */ #include <linux/errno.h> #include <linux/gfp.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> #include <asm/ldt.h> #include <asm/tlb.h> #include <asm/desc.h> #include <asm/mmu_context.h> #include <asm/pgtable_areas.h> #include <xen/xen.h> /* This is a multiple of PAGE_SIZE. */ #define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) /* Address of the alias for LDT slot @slot: LDT_BASE_ADDR plus @slot strides. */ static inline void *ldt_slot_va(int slot) { return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); } /* Load the LDT for @mm: through the slot alias when PTI is enabled, directly from ldt->entries otherwise, or clear the LDT when @mm has none. */ void load_mm_ldt(struct mm_struct *mm) { struct ldt_struct *ldt; /* READ_ONCE synchronizes with smp_store_release */ ldt = READ_ONCE(mm->context.ldt); /* * Any change to mm->context.ldt is followed by an IPI to all * CPUs with the mm active. The LDT will not be freed until * after the IPI is handled by all such CPUs.
This means that * if the ldt_struct changes before we return, the values we see * will be safe, and the new values will be loaded before we run * any user code. * * NB: don't try to convert this to use RCU without extreme care. * We would still need IRQs off, because we don't want to change * the local LDT after an IPI loaded a newer value than the one * that we can see. */ if (unlikely(ldt)) { if (static_cpu_has(X86_FEATURE_PTI)) { if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) { /* * Whoops -- either the new LDT isn't mapped * (if slot == -1) or is mapped into a bogus * slot (if slot > 1). */ clear_LDT(); return; } /* * If page table isolation is enabled, ldt->entries * will not be mapped in the userspace pagetables. * Tell the CPU to access the LDT through the alias * at ldt_slot_va(ldt->slot). */ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); } else { set_ldt(ldt->entries, ldt->nr_entries); } } else { clear_LDT(); } } /* Reload the LDT for @next when either @prev or @next has one. */ void switch_ldt(struct mm_struct *prev, struct mm_struct *next) { /* * Load the LDT if either the old or new mm had an LDT. * * An mm will never go from having an LDT to not having an LDT. Two * mms never share an LDT, so we don't gain anything by checking to * see whether the LDT changed. There's also no guarantee that * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL, * then prev->context.ldt will also be non-NULL. * * If we really cared, we could optimize the case where prev == next * and we're exiting lazy mode. Most of the time, if this happens, * we don't actually need to reload LDTR, but modify_ldt() is mostly * used by legacy code and emulators where we don't need this level of * performance. * * This uses | instead of || because it generates better code.
*/ if (unlikely((unsigned long)prev->context.ldt | (unsigned long)next->context.ldt)) load_mm_ldt(next); DEBUG_LOCKS_WARN_ON(preemptible()); } /* On 64-bit, reload DS and ES when they currently hold LDT selectors; empty otherwise. */ static void refresh_ldt_segments(void) { #ifdef CONFIG_X86_64 unsigned short sel; /* * Make sure that the cached DS and ES descriptors match the updated * LDT. */ savesegment(ds, sel); if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) loadsegment(ds, sel); savesegment(es, sel); if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) loadsegment(es, sel); #endif } /* context.lock is held by the task which issued the smp function call */ static void flush_ldt(void *__mm) { struct mm_struct *mm = __mm; if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) return; load_mm_ldt(mm); refresh_ldt_segments(); } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) { struct ldt_struct *new_ldt; unsigned int alloc_size; if (num_entries > LDT_ENTRIES) return NULL; new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT); if (!new_ldt) return NULL; BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); alloc_size = num_entries * LDT_ENTRY_SIZE; /* * Xen is very picky: it requires a page-aligned LDT that has no * trailing nonzero bytes in any page that contains LDT descriptors. * Keep it simple: zero the whole allocation and never allocate less * than PAGE_SIZE. */ if (alloc_size > PAGE_SIZE) new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!new_ldt->entries) { kfree(new_ldt); return NULL; } /* The new LDT isn't aliased for PTI yet. */ new_ldt->slot = -1; new_ldt->nr_entries = num_entries; return new_ldt; } #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* Warn when the presence of the kernel (and, with PTI, user) top-level mapping for the LDT area does not match whether @mm already has an LDT. */ static void do_sanity_check(struct mm_struct *mm, bool had_kernel_mapping, bool had_user_mapping) { if (mm->context.ldt) { /* * We already had an LDT.
The top-level entry should already * have been allocated and synchronized with the usermode * tables. */ WARN_ON(!had_kernel_mapping); if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(!had_user_mapping); } else { /* * This is the first time we're mapping an LDT for this process. * Sync the pgd to the usermode tables. */ WARN_ON(had_kernel_mapping); if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(had_user_mapping); } } #ifdef CONFIG_X86_PAE /* Walk from @pgd to the pmd entry for @va; returns NULL when the pgd, p4d or pud level is empty. */ static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va) { p4d_t *p4d; pud_t *pud; if (pgd->pgd == 0) return NULL; p4d = p4d_offset(pgd, va); if (p4d_none(*p4d)) return NULL; pud = pud_offset(p4d, va); if (pud_none(*pud)) return NULL; return pmd_offset(pud, va); } /* PAE: with PTI and no LDT installed yet, copy the kernel pmd entry for the LDT area into the user page table. NOTE(review): neither walk result is NULL-checked here - confirm the callers guarantee both levels are populated. */ static void map_ldt_struct_to_user(struct mm_struct *mm) { pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); pmd_t *k_pmd, *u_pmd; k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pmd(u_pmd, *k_pmd); } /* PAE: derive had_kernel/had_user from the pmd entries for the LDT area and pass them to do_sanity_check(). */ static void sanity_check_ldt_mapping(struct mm_struct *mm) { pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); bool had_kernel, had_user; pmd_t *k_pmd, *u_pmd; k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); had_kernel = (k_pmd->pmd != 0); had_user = (u_pmd->pmd != 0); do_sanity_check(mm, had_kernel, had_user); } #else /* !CONFIG_X86_PAE */ /* Non-PAE: with PTI and no LDT installed yet, copy the kernel pgd entry for the LDT area into the user page table. */ static void map_ldt_struct_to_user(struct mm_struct *mm) { pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pgd(kernel_to_user_pgdp(pgd), *pgd); } /* Non-PAE: derive had_kernel/had_user from the pgd entries for the LDT area and pass them to do_sanity_check(). */ static void sanity_check_ldt_mapping(struct mm_struct *mm) { pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); bool had_kernel = (pgd->pgd != 0); bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0); do_sanity_check(mm, had_kernel, had_user); } #endif /* CONFIG_X86_PAE */ /* * If PTI is enabled, this maps the LDT into the
kernelmode and * usermode tables for the given mm. */ static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) { unsigned long va; bool is_vmalloc; spinlock_t *ptl; int i, nr_pages; if (!boot_cpu_has(X86_FEATURE_PTI)) return 0; /* * Any given ldt_struct should have map_ldt_struct() called at most * once. */ WARN_ON(ldt->slot != -1); /* Check if the current mappings are sane */ sanity_check_ldt_mapping(mm); is_vmalloc = is_vmalloc_addr(ldt->entries); nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); for (i = 0; i < nr_pages; i++) { unsigned long offset = i << PAGE_SHIFT; const void *src = (char *)ldt->entries + offset; unsigned long pfn; pgprot_t pte_prot; pte_t pte, *ptep; va = (unsigned long)ldt_slot_va(slot) + offset; pfn = is_vmalloc ? vmalloc_to_pfn(src) : page_to_pfn(virt_to_page(src)); /* * Treat the PTI LDT range as a *userspace* range. * get_locked_pte() will allocate all needed pagetables * and account for them in this mm. */ ptep = get_locked_pte(mm, va, &ptl); if (!ptep) return -ENOMEM; /* * Map it RO so the easy to find address is not a primary * target via some kernel interface which misses a * permission check.
*/ pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL); /* Filter out unsupported __PAGE_KERNEL* bits: */ pgprot_val(pte_prot) &= __supported_pte_mask; pte = pfn_pte(pfn, pte_prot); set_pte_at(mm, va, ptep, pte); pte_unmap_unlock(ptep, ptl); } /* Propagate LDT mapping to the user page-table */ map_ldt_struct_to_user(mm); ldt->slot = slot; return 0; } /* Clear the PTEs of @ldt's slot alias in @mm page by page, then flush the TLB for that range. Does nothing when @ldt is NULL or PTI is off. */ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) { unsigned long va; int i, nr_pages; if (!ldt) return; /* LDT map/unmap is only required for PTI */ if (!boot_cpu_has(X86_FEATURE_PTI)) return; nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); for (i = 0; i < nr_pages; i++) { unsigned long offset = i << PAGE_SHIFT; spinlock_t *ptl; pte_t *ptep; va = (unsigned long)ldt_slot_va(ldt->slot) + offset; ptep = get_locked_pte(mm, va, &ptl); if (!WARN_ON_ONCE(!ptep)) { pte_clear(mm, va, ptep); pte_unmap_unlock(ptep, ptl); } } va = (unsigned long)ldt_slot_va(ldt->slot); flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false); } #else /* !CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ /* Without PTI support compiled in, mapping and unmapping are no-ops. */ static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) { return 0; } static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) { } #endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ /* Free the page tables covering LDT_BASE_ADDR..LDT_END_ADDR in @mm; only does work when PTI is compiled in and enabled. */ static void free_ldt_pgtables(struct mm_struct *mm) { #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION struct mmu_gather tlb; unsigned long start = LDT_BASE_ADDR; unsigned long end = LDT_END_ADDR; if (!boot_cpu_has(X86_FEATURE_PTI)) return; /* * Although free_pgd_range() is intended for freeing user * page-tables, it also works out for kernel mappings on x86. * We use tlb_gather_mmu_fullmm() to avoid confusing the * range-tracking logic in __tlb_adjust_range(). */ tlb_gather_mmu_fullmm(&tlb, mm); free_pgd_range(&tlb, start, end, start, end); tlb_finish_mmu(&tlb); #endif } /* After calling this, the LDT is immutable.
*/ static void finalize_ldt_struct(struct ldt_struct *ldt) { paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); } /* Publish @ldt as @mm's LDT under context.lock and have every CPU in mm_cpumask(mm) run flush_ldt(), waiting for them to finish. */ static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt) { mutex_lock(&mm->context.lock); /* Synchronizes with READ_ONCE in load_mm_ldt. */ smp_store_release(&mm->context.ldt, ldt); /* Activate the LDT for all CPUs using current's mm. */ on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); mutex_unlock(&mm->context.lock); } /* Release @ldt: its entries (vmalloc area when larger than one page, a single page otherwise, mirroring alloc_ldt_struct()) and the struct itself. NULL is a no-op. */ static void free_ldt_struct(struct ldt_struct *ldt) { if (likely(!ldt)) return; paravirt_free_ldt(ldt->entries, ldt->nr_entries); if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE) vfree_atomic(ldt->entries); else free_page((unsigned long)ldt->entries); kfree(ldt); } /* * Called on fork from arch_dup_mmap(). Just copy the current LDT state, * the new task is not running, so nothing can be installed. */ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) { struct ldt_struct *new_ldt; int retval = 0; if (!old_mm) return 0; mutex_lock(&old_mm->context.lock); if (!old_mm->context.ldt) goto out_unlock; new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); if (!new_ldt) { retval = -ENOMEM; goto out_unlock; } memcpy(new_ldt->entries, old_mm->context.ldt->entries, new_ldt->nr_entries * LDT_ENTRY_SIZE); finalize_ldt_struct(new_ldt); retval = map_ldt_struct(mm, new_ldt, 0); if (retval) { free_ldt_pgtables(mm); free_ldt_struct(new_ldt); goto out_unlock; } mm->context.ldt = new_ldt; out_unlock: mutex_unlock(&old_mm->context.lock); return retval; } /* * No need to lock the MM as we are the last user * * 64bit: Don't touch the LDT register - we're already in the next thread.
*/ void destroy_context_ldt(struct mm_struct *mm) { free_ldt_struct(mm->context.ldt); mm->context.ldt = NULL; } /* Free @mm's LDT page tables via free_ldt_pgtables(). */ void ldt_arch_exit_mmap(struct mm_struct *mm) { free_ldt_pgtables(mm); } /* Copy the current LDT to user space: @bytecount is clamped to the maximum table size, existing entries are copied and the remainder is zero-filled. Returns the clamped byte count, 0 when there is no LDT, or -EFAULT. Runs under ldt_usr_sem held for reading. */ static int read_ldt(void __user *ptr, unsigned long bytecount) { struct mm_struct *mm = current->mm; unsigned long entries_size; int retval; down_read(&mm->context.ldt_usr_sem); if (!mm->context.ldt) { retval = 0; goto out_unlock; } if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE; if (entries_size > bytecount) entries_size = bytecount; if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) { retval = -EFAULT; goto out_unlock; } if (entries_size != bytecount) { /* Zero-fill the rest and pretend we read bytecount bytes. */ if (clear_user(ptr + entries_size, bytecount - entries_size)) { retval = -EFAULT; goto out_unlock; } } retval = bytecount; out_unlock: up_read(&mm->context.ldt_usr_sem); return retval; } /* Zero-fill up to a fixed size of the user buffer and return the number of bytes cleared, or -EFAULT. */ static int read_default_ldt(void __user *ptr, unsigned long bytecount) { /* CHECKME: Can we use _one_ random number ? */ #ifdef CONFIG_X86_32 unsigned long size = 5 * sizeof(struct desc_struct); #else unsigned long size = 128; #endif if (bytecount > size) bytecount = size; if (clear_user(ptr, bytecount)) return -EFAULT; return bytecount; } /* Whether 16-bit segments may be installed: requires CONFIG_X86_16BIT and is refused in a Xen PV domain. */ static bool allow_16bit_segments(void) { if (!IS_ENABLED(CONFIG_X86_16BIT)) return false; #ifdef CONFIG_XEN_PV /* * Xen PV does not implement ESPFIX64, which means that 16-bit * segments will not work correctly. Until either Xen PV implements * ESPFIX64 and can signal this fact to the guest or unless someone * provides compelling evidence that allowing broken 16-bit segments * is worthwhile, disallow 16-bit segments under Xen PV.
*/ if (xen_pv_domain()) { pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n"); return false; } #endif return true; } /* Install or clear one LDT entry described by the struct user_desc at @ptr. The descriptor is validated first; then, under ldt_usr_sem held for writing, a new LDT is allocated, the old entries are copied over, the entry is set, and the new table is mapped (PTI), installed and the old one unmapped and freed. @oldmode selects the legacy semantics (func 1 in the syscall dispatch). Returns 0 or a negative errno. */ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; struct ldt_struct *new_ldt, *old_ldt; unsigned int old_nr_entries, new_nr_entries; struct user_desc ldt_info; struct desc_struct ldt; int error; error = -EINVAL; if (bytecount != sizeof(ldt_info)) goto out; error = -EFAULT; if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) goto out; error = -EINVAL; if (ldt_info.entry_number >= LDT_ENTRIES) goto out; if (ldt_info.contents == 3) { if (oldmode) goto out; if (ldt_info.seg_not_present == 0) goto out; } if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) || LDT_empty(&ldt_info)) { /* The user wants to clear the entry. */ memset(&ldt, 0, sizeof(ldt)); } else { if (!ldt_info.seg_32bit && !allow_16bit_segments()) { error = -EINVAL; goto out; } fill_ldt(&ldt, &ldt_info); if (oldmode) ldt.avl = 0; } if (down_write_killable(&mm->context.ldt_usr_sem)) return -EINTR; old_ldt = mm->context.ldt; old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries); error = -ENOMEM; new_ldt = alloc_ldt_struct(new_nr_entries); if (!new_ldt) goto out_unlock; if (old_ldt) memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE); new_ldt->entries[ldt_info.entry_number] = ldt; finalize_ldt_struct(new_ldt); /* * If we are using PTI, map the new LDT into the userspace pagetables. * If there is already an LDT, use the other slot so that other CPUs * will continue to use the old LDT until install_ldt() switches * them over to the new LDT. */ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); if (error) { /* * This only can fail for the first LDT setup. If an LDT is * already installed then the PTE page is already * populated. Mop up a half populated page table.
*/ if (!WARN_ON_ONCE(old_ldt)) free_ldt_pgtables(mm); free_ldt_struct(new_ldt); goto out_unlock; } install_ldt(mm, new_ldt); unmap_ldt_struct(mm, old_ldt); free_ldt_struct(old_ldt); error = 0; out_unlock: up_write(&mm->context.ldt_usr_sem); out: return error; } /* modify_ldt() entry point: func 0 reads the LDT, 1 writes an entry in old mode, 2 reads the default LDT, 0x11 writes an entry in new mode; any other func returns -ENOSYS. */ SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , unsigned long , bytecount) { int ret = -ENOSYS; switch (func) { case 0: ret = read_ldt(ptr, bytecount); break; case 1: ret = write_ldt(ptr, bytecount, 1); break; case 2: ret = read_default_ldt(ptr, bytecount); break; case 0x11: ret = write_ldt(ptr, bytecount, 0); break; } /* * The SYSCALL_DEFINE() macros give us an 'unsigned long' * return type, but the ABI for sys_modify_ldt() expects * 'int'. This cast gives us an int-sized value in %rax * for the return code. The 'unsigned' is necessary so * the compiler does not try to sign-extend the negative * return codes into the high half of the register when * taking the value from int->long. */ return (unsigned int)ret; } |
| 4 3 3 3 12 12 2 2 10 11 1 13 20 20 20 14 14 1 13 13 13 3 10 13 31 31 10 1 1 1 1 1 1 2 2 1 1 1 1 3 3 3 3 3 1 4 8 6 1 36 37 37 2 2 2 2 2 2 1 4 2 19 36 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 
489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org> * Copyright 2001-2006 Ian Kent <raven@themaw.net> */ #include 
<linux/capability.h> #include <linux/compat.h> #include "autofs_i.h" static int autofs_dir_permission(struct mnt_idmap *, struct inode *, int); static int autofs_dir_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); static int autofs_dir_unlink(struct inode *, struct dentry *); static int autofs_dir_rmdir(struct inode *, struct dentry *); static int autofs_dir_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT static long autofs_root_compat_ioctl(struct file *, unsigned int, unsigned long); #endif static int autofs_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs_lookup(struct inode *, struct dentry *, unsigned int); static struct vfsmount *autofs_d_automount(struct path *); static int autofs_d_manage(const struct path *, bool); static void autofs_dentry_release(struct dentry *); /* File operations for the autofs root directory: the dcache directory helpers plus the autofs ioctl handlers. */ const struct file_operations autofs_root_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .read = generic_read_dir, .iterate_shared = dcache_readdir, .llseek = dcache_dir_lseek, .unlocked_ioctl = autofs_root_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = autofs_root_compat_ioctl, #endif }; /* File operations for other autofs directories: same helpers, but open goes through autofs_dir_open() and there is no ioctl. */ const struct file_operations autofs_dir_operations = { .open = autofs_dir_open, .release = dcache_dir_close, .read = generic_read_dir, .iterate_shared = dcache_readdir, .llseek = dcache_dir_lseek, }; const struct inode_operations autofs_dir_inode_operations = { .lookup = autofs_lookup, .permission = autofs_dir_permission, .unlink = autofs_dir_unlink, .symlink = autofs_dir_symlink, .mkdir = autofs_dir_mkdir, .rmdir = autofs_dir_rmdir, }; const struct dentry_operations autofs_dentry_operations = { .d_automount = autofs_d_automount, .d_manage = autofs_d_manage, .d_release = autofs_dentry_release, }; /* Take @dentry's autofs_info off the active list, under lookup_lock. */ static void autofs_del_active(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct
autofs_info *ino; ino = autofs_dentry_ino(dentry); spin_lock(&sbi->lookup_lock); list_del_init(&ino->active); spin_unlock(&sbi->lookup_lock); } /* Directory open: outside oz mode, fail with -ENOENT for a directory that is empty and not a mount point; otherwise defer to dcache_dir_open(). */ static int autofs_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry); if (autofs_oz_mode(sbi)) goto out; /* * An empty directory in an autofs file system is always a * mount point. The daemon must have failed to mount this * during lookup so it doesn't exist. This can happen, for * example, if user space returns an incorrect status for a * mount request. Otherwise we're doing a readdir on the * autofs file system so just let the libfs routines handle * it. */ spin_lock(&sbi->lookup_lock); if (!path_is_mountpoint(&file->f_path) && autofs_empty(ino)) { spin_unlock(&sbi->lookup_lock); return -ENOENT; } spin_unlock(&sbi->lookup_lock); out: return dcache_dir_open(inode, file); } /* d_release: unlink the dentry's autofs_info from the active and expiring lists (when a superblock info exists) and free it. Does nothing if the dentry has no autofs_info. */ static void autofs_dentry_release(struct dentry *de) { struct autofs_info *ino = autofs_dentry_ino(de); struct autofs_sb_info *sbi = autofs_sbi(de->d_sb); pr_debug("releasing %p\n", de); if (!ino) return; if (sbi) { spin_lock(&sbi->lookup_lock); if (!list_empty(&ino->active)) list_del(&ino->active); if (!list_empty(&ino->expiring)) list_del(&ino->expiring); spin_unlock(&sbi->lookup_lock); } autofs_free_ino(ino); } /* Search the active list for an unhashed dentry with the same parent, hash and name as @dentry. Returns it with an extra reference taken, or NULL. */ static struct dentry *autofs_lookup_active(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct dentry *parent = dentry->d_parent; const struct qstr *name = &dentry->d_name; unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; struct list_head *p, *head; head = &sbi->active_list; if (list_empty(head)) return NULL; spin_lock(&sbi->lookup_lock); list_for_each(p, head) { struct autofs_info *ino; struct dentry *active; const struct qstr *qstr; ino =
list_entry(p, struct autofs_info, active); active = ino->dentry; spin_lock(&active->d_lock); /* Already gone? */ if ((int) d_count(active) <= 0) goto next; qstr = &active->d_name; if (active->d_name.hash != hash) goto next; if (active->d_parent != parent) goto next; if (qstr->len != len) goto next; if (memcmp(qstr->name, str, len)) goto next; if (d_unhashed(active)) { dget_dlock(active); spin_unlock(&active->d_lock); spin_unlock(&sbi->lookup_lock); return active; } next: spin_unlock(&active->d_lock); } spin_unlock(&sbi->lookup_lock); return NULL; } /* Search the expiring list for an unhashed, positive dentry with the same parent, hash and name as @dentry. Returns it with an extra reference taken, NULL if none matches, or ERR_PTR(-ECHILD) when @rcu_walk is set and the list is not empty. */ static struct dentry *autofs_lookup_expiring(struct dentry *dentry, bool rcu_walk) { struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct dentry *parent = dentry->d_parent; const struct qstr *name = &dentry->d_name; unsigned int len = name->len; unsigned int hash = name->hash; const unsigned char *str = name->name; struct list_head *p, *head; head = &sbi->expiring_list; if (list_empty(head)) return NULL; spin_lock(&sbi->lookup_lock); list_for_each(p, head) { struct autofs_info *ino; struct dentry *expiring; const struct qstr *qstr; if (rcu_walk) { spin_unlock(&sbi->lookup_lock); return ERR_PTR(-ECHILD); } ino = list_entry(p, struct autofs_info, expiring); expiring = ino->dentry; spin_lock(&expiring->d_lock); /* We've already been dentry_iput or unlinked */ if (d_really_is_negative(expiring)) goto next; qstr = &expiring->d_name; if (expiring->d_name.hash != hash) goto next; if (expiring->d_parent != parent) goto next; if (qstr->len != len) goto next; if (memcmp(qstr->name, str, len)) goto next; if (d_unhashed(expiring)) { dget_dlock(expiring); spin_unlock(&expiring->d_lock); spin_unlock(&sbi->lookup_lock); return expiring; } next: spin_unlock(&expiring->d_lock); } spin_unlock(&sbi->lookup_lock); return NULL; } /* Wait for a pending mount on @path if there is one. Returns the wait status, -ECHILD when pending in rcu-walk mode, or 0 when nothing is pending. */ static int autofs_mount_wait(const struct path *path, bool rcu_walk) { struct autofs_sb_info *sbi = autofs_sbi(path->dentry->d_sb); struct autofs_info *ino = autofs_dentry_ino(path->dentry); int status = 0;
if (ino->flags & AUTOFS_INF_PENDING) { if (rcu_walk) return -ECHILD; pr_debug("waiting for mount name=%pd\n", path->dentry); status = autofs_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); ino->last_used = jiffies; return status; } if (!(sbi->flags & AUTOFS_SBI_STRICTEXPIRE)) ino->last_used = jiffies; return status; } static int do_expire_wait(const struct path *path, bool rcu_walk) { struct dentry *dentry = path->dentry; struct dentry *expiring; expiring = autofs_lookup_expiring(dentry, rcu_walk); if (IS_ERR(expiring)) return PTR_ERR(expiring); if (!expiring) return autofs_expire_wait(path, rcu_walk); else { const struct path this = { .mnt = path->mnt, .dentry = expiring }; /* * If we are racing with expire the request might not * be quite complete, but the directory has been removed * so it must have been successful, just wait for it. */ autofs_expire_wait(&this, 0); autofs_del_expiring(expiring); dput(expiring); } return 0; } static struct dentry *autofs_mountpoint_changed(struct path *path) { struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); /* If this is an indirect mount the dentry could have gone away * and a new one created. * * This is unusual and I can't remember the case for which it * was originally added now. But an example of how this can * happen is an autofs indirect mount that has the "browse" * option set and also has the "symlink" option in the autofs * map entry. In this case the daemon will remove the browse * directory and create a symlink as the mount leaving the * struct path stale. * * Another not so obvious case is when a mount in an autofs * indirect mount that uses the "nobrowse" option is being * expired at the same time as a path walk. If the mount has * been umounted but the mount point directory seen before * becoming unhashed (during a lockless path walk) when a stat * family system call is made the mount won't be re-mounted as * it should. 
In this case the mount point that's been removed * (by the daemon) will be stale and the a new mount point * dentry created. */ if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; struct autofs_info *ino; struct dentry *new; new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; ino = autofs_dentry_ino(new); ino->last_used = jiffies; dput(path->dentry); path->dentry = new; } return path->dentry; } static struct vfsmount *autofs_d_automount(struct path *path) { struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); int status; pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never triggers a mount. */ if (autofs_oz_mode(sbi)) return NULL; /* * If an expire request is pending everyone must wait. * If the expire fails we're still mounted so continue * the follow and return. A return of -EAGAIN (which only * happens with indirect mounts) means the expire completed * and the directory was removed, so just go ahead and try * the mount. */ status = do_expire_wait(path, 0); if (status && status != -EAGAIN) return NULL; /* Callback to the daemon to perform the mount or wait */ spin_lock(&sbi->fs_lock); if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); status = autofs_mount_wait(path, 0); if (status) return ERR_PTR(status); goto done; } /* * If the dentry is a symlink it's equivalent to a directory * having path_is_mountpoint() true, so there's no need to call * back to the daemon. */ if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { spin_unlock(&sbi->fs_lock); goto done; } if (!path_is_mountpoint(path)) { /* * It's possible that user space hasn't removed directories * after umounting a rootless multi-mount, although it * should. 
For v5 path_has_submounts() is sufficient to * handle this because the leaves of the directory tree under * the mount never trigger mounts themselves (they have an * autofs trigger mount mounted on them). But v4 pseudo direct * mounts do need the leaves to trigger mounts. In this case * we have no choice but to use the autofs_empty() check and * require user space behave. */ if (sbi->version > 4) { if (path_has_submounts(path)) { spin_unlock(&sbi->fs_lock); goto done; } } else { if (!autofs_empty(ino)) { spin_unlock(&sbi->fs_lock); goto done; } } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); status = autofs_mount_wait(path, 0); spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_PENDING; if (status) { spin_unlock(&sbi->fs_lock); return ERR_PTR(status); } } spin_unlock(&sbi->fs_lock); done: /* Mount succeeded, check if we ended up with a new dentry */ dentry = autofs_mountpoint_changed(path); if (!dentry) return ERR_PTR(-ENOENT); return NULL; } static int autofs_d_manage(const struct path *path, bool rcu_walk) { struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); int status; pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never waits. */ if (autofs_oz_mode(sbi)) { if (!path_is_mountpoint(path)) return -EISDIR; return 0; } /* Wait for pending expires */ if (do_expire_wait(path, rcu_walk) == -ECHILD) return -ECHILD; /* * This dentry may be under construction so wait on mount * completion. */ status = autofs_mount_wait(path, rcu_walk); if (status) return status; if (rcu_walk) { /* We don't need fs_lock in rcu_walk mode, * just testing 'AUTOFS_INF_WANT_EXPIRE' is enough. * * We only return -EISDIR when certain this isn't * a mount-trap. 
*/ struct inode *inode; if (ino->flags & AUTOFS_INF_WANT_EXPIRE) return 0; if (path_is_mountpoint(path)) return 0; inode = d_inode_rcu(dentry); if (inode && S_ISLNK(inode->i_mode)) return -EISDIR; if (!autofs_empty(ino)) return -EISDIR; return 0; } spin_lock(&sbi->fs_lock); /* * If the dentry has been selected for expire while we slept * on the lock then it might go away. We'll deal with that in * ->d_automount() and wait on a new mount if the expire * succeeds or return here if it doesn't (since there's no * mount to follow with a rootless multi-mount). */ if (!(ino->flags & AUTOFS_INF_EXPIRING)) { /* * Any needed mounting has been completed and the path * updated so check if this is a rootless multi-mount so * we can avoid needless calls ->d_automount() and avoid * an incorrect ELOOP error return. */ if ((!path_is_mountpoint(path) && !autofs_empty(ino)) || (d_really_is_positive(dentry) && d_is_symlink(dentry))) status = -EISDIR; } spin_unlock(&sbi->fs_lock); return status; } /* Lookups in the root directory */ static struct dentry *autofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct autofs_sb_info *sbi; struct autofs_info *ino; struct dentry *active; pr_debug("name = %pd\n", dentry); /* File name too long to exist */ if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); sbi = autofs_sbi(dir->i_sb); pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", current->pid, task_pgrp_nr(current), sbi->flags & AUTOFS_SBI_CATATONIC, autofs_oz_mode(sbi)); active = autofs_lookup_active(dentry); if (active) return active; else { /* * A dentry that is not within the root can never trigger a * mount operation, unless the directory already exists, so we * can return fail immediately. The daemon however does need * to create directories within the file system. 
*/ if (!autofs_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) return ERR_PTR(-ENOENT); ino = autofs_new_ino(sbi); if (!ino) return ERR_PTR(-ENOMEM); spin_lock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); /* Mark entries in the root as mount triggers */ if (IS_ROOT(dentry->d_parent) && autofs_type_indirect(sbi->type)) __managed_dentry_set_managed(dentry); dentry->d_fsdata = ino; ino->dentry = dentry; list_add(&ino->active, &sbi->active_list); spin_unlock(&sbi->lookup_lock); spin_unlock(&dentry->d_lock); } return NULL; } static int autofs_dir_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (mask & MAY_WRITE) { struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); if (!autofs_oz_mode(sbi)) return -EACCES; /* autofs_oz_mode() needs to allow path walks when the * autofs mount is catatonic but the state of an autofs * file system needs to be preserved over restarts. */ if (sbi->flags & AUTOFS_SBI_CATATONIC) return -EACCES; } return generic_permission(idmap, inode, mask); } static int autofs_dir_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; size_t size = strlen(symname); char *cp; pr_debug("%s <- %pd\n", symname, dentry); BUG_ON(!ino); autofs_clean_ino(ino); autofs_del_active(dentry); cp = kmalloc(size + 1, GFP_KERNEL); if (!cp) return -ENOMEM; strcpy(cp, symname); inode = autofs_get_inode(dir->i_sb, S_IFLNK | 0555); if (!inode) { kfree(cp); return -ENOMEM; } inode->i_private = cp; inode->i_size = size; d_add(dentry, inode); dget(dentry); p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); return 0; } /* * NOTE! * * Normal filesystems would do a "d_delete()" to tell the VFS dcache * that the file no longer exists. However, doing that means that the * VFS layer can turn the dentry into a negative dentry. 
We don't want * this, because the unlink is probably the result of an expire. * We simply d_drop it and add it to a expiring list in the super block, * which allows the dentry lookup to check for an incomplete expire. * * If a process is blocked on the dentry waiting for the expire to finish, * it will invalidate the dentry and try to mount with a new one. * * Also see autofs_dir_rmdir().. */ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) { struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count--; dput(ino->dentry); d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); spin_lock(&sbi->lookup_lock); __autofs_add_expiring(dentry); d_drop(dentry); spin_unlock(&sbi->lookup_lock); return 0; } /* * Version 4 of autofs provides a pseudo direct mount implementation * that relies on directories at the leaves of a directory tree under * an indirect mount to trigger mounts. To allow for this we need to * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves * of the directory tree. There is no need to clear the automount flag * following a mount or restore it after an expire because these mounts * are always covered. However, it is necessary to ensure that these * flags are clear on non-empty directories to avoid unnecessary calls * during path walks. 
*/ static void autofs_set_leaf_automount_flags(struct dentry *dentry) { struct dentry *parent; /* root and dentrys in the root are already handled */ if (IS_ROOT(dentry->d_parent)) return; managed_dentry_set_managed(dentry); parent = dentry->d_parent; /* only consider parents below dentrys in the root */ if (IS_ROOT(parent->d_parent)) return; managed_dentry_clear_managed(parent); } static void autofs_clear_leaf_automount_flags(struct dentry *dentry) { struct dentry *parent; /* flags for dentrys in the root are handled elsewhere */ if (IS_ROOT(dentry->d_parent)) return; managed_dentry_clear_managed(dentry); parent = dentry->d_parent; /* only consider parents below dentrys in the root */ if (IS_ROOT(parent->d_parent)) return; if (autofs_dentry_ino(parent)->count == 2) managed_dentry_set_managed(parent); } static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) { struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; pr_debug("dentry %p, removing %pd\n", dentry, dentry); if (ino->count != 1) return -ENOTEMPTY; spin_lock(&sbi->lookup_lock); __autofs_add_expiring(dentry); d_drop(dentry); spin_unlock(&sbi->lookup_lock); if (sbi->version < 5) autofs_clear_leaf_automount_flags(dentry); p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count--; dput(ino->dentry); d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); if (dir->i_nlink) drop_nlink(dir); return 0; } static int autofs_dir_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; pr_debug("dentry %p, creating %pd\n", dentry, dentry); BUG_ON(!ino); autofs_clean_ino(ino); autofs_del_active(dentry); inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) return -ENOMEM; d_add(dentry, inode); if (sbi->version < 5) 
autofs_set_leaf_automount_flags(dentry); dget(dentry); p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; inc_nlink(dir); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); return 0; } /* Get/set timeout ioctl() operation */ #ifdef CONFIG_COMPAT static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi, compat_ulong_t __user *p) { unsigned long ntimeout; int rv; rv = get_user(ntimeout, p); if (rv) goto error; rv = put_user(sbi->exp_timeout/HZ, p); if (rv) goto error; if (ntimeout > UINT_MAX/HZ) sbi->exp_timeout = 0; else sbi->exp_timeout = ntimeout * HZ; return 0; error: return rv; } #endif static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi, unsigned long __user *p) { unsigned long ntimeout; int rv; rv = get_user(ntimeout, p); if (rv) goto error; rv = put_user(sbi->exp_timeout/HZ, p); if (rv) goto error; if (ntimeout > ULONG_MAX/HZ) sbi->exp_timeout = 0; else sbi->exp_timeout = ntimeout * HZ; return 0; error: return rv; } /* Return protocol version */ static inline int autofs_get_protover(struct autofs_sb_info *sbi, int __user *p) { return put_user(sbi->version, p); } /* Return protocol sub version */ static inline int autofs_get_protosubver(struct autofs_sb_info *sbi, int __user *p) { return put_user(sbi->sub_version, p); } /* * Tells the daemon whether it can umount the autofs mount. */ static inline int autofs_ask_umount(struct vfsmount *mnt, int __user *p) { int status = 0; if (may_umount(mnt)) status = 1; pr_debug("may umount %d\n", status); status = put_user(status, p); return status; } /* Identify autofs_dentries - this is so we can tell if there's * an extra dentry refcount or not. 
We only hold a refcount on the * dentry if its non-negative (ie, d_inode != NULL) */ int is_autofs_dentry(struct dentry *dentry) { return dentry && d_really_is_positive(dentry) && dentry->d_op == &autofs_dentry_operations && dentry->d_fsdata != NULL; } /* * ioctl()'s on the root directory is the chief method for the daemon to * generate kernel reactions */ static int autofs_root_ioctl_unlocked(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); void __user *p = (void __user *)arg; pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", cmd, arg, sbi, task_pgrp_nr(current)); if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; switch (cmd) { case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ return autofs_wait_release(sbi, (autofs_wqt_t) arg, 0); case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ return autofs_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT); case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ autofs_catatonic_mode(sbi); return 0; case AUTOFS_IOC_PROTOVER: /* Get protocol version */ return autofs_get_protover(sbi, p); case AUTOFS_IOC_PROTOSUBVER: /* Get protocol sub version */ return autofs_get_protosubver(sbi, p); case AUTOFS_IOC_SETTIMEOUT: return autofs_get_set_timeout(sbi, p); #ifdef CONFIG_COMPAT case AUTOFS_IOC_SETTIMEOUT32: return autofs_compat_get_set_timeout(sbi, p); #endif case AUTOFS_IOC_ASKUMOUNT: return autofs_ask_umount(filp->f_path.mnt, p); /* return a single thing to expire */ case AUTOFS_IOC_EXPIRE: return autofs_expire_run(inode->i_sb, filp->f_path.mnt, sbi, p); /* same as above, but can send multiple expires through pipe */ case AUTOFS_IOC_EXPIRE_MULTI: return autofs_expire_multi(inode->i_sb, filp->f_path.mnt, sbi, p); default: return -EINVAL; } } static long 
#ifdef CONFIG_COMPAT
/*
 * Compat entry point for ioctl()s on the autofs root directory.
 *
 * AUTOFS_IOC_READY and AUTOFS_IOC_FAIL pass @arg through unchanged; for
 * every other command @arg is converted with compat_ptr() before being
 * handed to autofs_root_ioctl_unlocked().
 */
static long autofs_root_compat_ioctl(struct file *filp,
				     unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = arg;
	int ret;

	if (cmd != AUTOFS_IOC_READY && cmd != AUTOFS_IOC_FAIL)
		native_arg = (unsigned long) compat_ptr(arg);

	ret = autofs_root_ioctl_unlocked(file_inode(filp), filp, cmd,
					 native_arg);

	return ret;
}
#endif
| 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) International Business Machines Corp., 2006 * Copyright (c) Nokia Corporation, 2007 * * Author: Artem Bityutskiy (Битюцкий Артём), * Frank Haverkamp */ /* * This file includes UBI initialization and building of UBI devices. * * When UBI is initialized, it attaches all the MTD devices specified as the * module load parameters or the kernel boot parameters. If MTD devices were * specified, UBI does not attach any MTD device, but it is possible to do * later using the "UBI control device". 
/**
 * struct mtd_dev_param - MTD device parameter description data structure.
 * @name: MTD character device node path, MTD device name, or MTD device number
 *        string (fixed buffer of %MTD_PARAM_LEN_MAX bytes)
 * @ubi_num: UBI number
 * @vid_hdr_offs: VID header offset
 * @max_beb_per1024: maximum expected number of bad PEBs per 1024 PEBs
 * @enable_fm: enable fastmap when value is non-zero
 * @need_resv_pool: reserve pool->max_size PEBs when value is non-zero
 */
struct mtd_dev_param {
	char name[MTD_PARAM_LEN_MAX];
	int ubi_num;
	int vid_hdr_offs;
	int max_beb_per1024;
	int enable_fm;
	int need_resv_pool;
};
/* Serializes UBI devices creations and removals */ DEFINE_MUTEX(ubi_devices_mutex); /* Protects @ubi_devices, @ubi->ref_count and @ubi->is_dead */ static DEFINE_SPINLOCK(ubi_devices_lock); /* "Show" method for files in '/<sysfs>/class/ubi/' */ /* UBI version attribute ('/<sysfs>/class/ubi/version') */ static ssize_t version_show(const struct class *class, const struct class_attribute *attr, char *buf) { return sprintf(buf, "%d\n", UBI_VERSION); } static CLASS_ATTR_RO(version); static struct attribute *ubi_class_attrs[] = { &class_attr_version.attr, NULL, }; ATTRIBUTE_GROUPS(ubi_class); /* Root UBI "class" object (corresponds to '/<sysfs>/class/ubi/') */ const struct class ubi_class = { .name = UBI_NAME_STR, .class_groups = ubi_class_groups, }; static ssize_t dev_attribute_show(struct device *dev, struct device_attribute *attr, char *buf); /* UBI device attributes (correspond to files in '/<sysfs>/class/ubi/ubiX') */ static struct device_attribute dev_eraseblock_size = __ATTR(eraseblock_size, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_avail_eraseblocks = __ATTR(avail_eraseblocks, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_total_eraseblocks = __ATTR(total_eraseblocks, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_volumes_count = __ATTR(volumes_count, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_max_ec = __ATTR(max_ec, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_reserved_for_bad = __ATTR(reserved_for_bad, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_bad_peb_count = __ATTR(bad_peb_count, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_max_vol_count = __ATTR(max_vol_count, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_min_io_size = __ATTR(min_io_size, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_bgt_enabled = __ATTR(bgt_enabled, S_IRUGO, 
dev_attribute_show, NULL); static struct device_attribute dev_mtd_num = __ATTR(mtd_num, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_ro_mode = __ATTR(ro_mode, S_IRUGO, dev_attribute_show, NULL); /** * ubi_volume_notify - send a volume change notification. * @ubi: UBI device description object * @vol: volume description object of the changed volume * @ntype: notification type to send (%UBI_VOLUME_ADDED, etc) * * This is a helper function which notifies all subscribers about a volume * change event (creation, removal, re-sizing, re-naming, updating). Returns * zero in case of success and a negative error code in case of failure. */ int ubi_volume_notify(struct ubi_device *ubi, struct ubi_volume *vol, int ntype) { int ret; struct ubi_notification nt; ubi_do_get_device_info(ubi, &nt.di); ubi_do_get_volume_info(ubi, vol, &nt.vi); switch (ntype) { case UBI_VOLUME_ADDED: case UBI_VOLUME_REMOVED: case UBI_VOLUME_RESIZED: case UBI_VOLUME_RENAMED: ret = ubi_update_fastmap(ubi); if (ret) ubi_msg(ubi, "Unable to write a new fastmap: %i", ret); } return blocking_notifier_call_chain(&ubi_notifiers, ntype, &nt); } /** * ubi_notify_all - send a notification to all volumes. * @ubi: UBI device description object * @ntype: notification type to send (%UBI_VOLUME_ADDED, etc) * @nb: the notifier to call * * This function walks all volumes of UBI device @ubi and sends the @ntype * notification for each volume. If @nb is %NULL, then all registered notifiers * are called, otherwise only the @nb notifier is called. Returns the number of * sent notifications. */ int ubi_notify_all(struct ubi_device *ubi, int ntype, struct notifier_block *nb) { struct ubi_notification nt; int i, count = 0; ubi_do_get_device_info(ubi, &nt.di); mutex_lock(&ubi->device_mutex); for (i = 0; i < ubi->vtbl_slots; i++) { /* * Since the @ubi->device is locked, and we are not going to * change @ubi->volumes, we do not have to lock * @ubi->volumes_lock. 
*/ if (!ubi->volumes[i]) continue; ubi_do_get_volume_info(ubi, ubi->volumes[i], &nt.vi); if (nb) nb->notifier_call(nb, ntype, &nt); else blocking_notifier_call_chain(&ubi_notifiers, ntype, &nt); count += 1; } mutex_unlock(&ubi->device_mutex); return count; } /** * ubi_enumerate_volumes - send "add" notification for all existing volumes. * @nb: the notifier to call * * This function walks all UBI devices and volumes and sends the * %UBI_VOLUME_ADDED notification for each volume. If @nb is %NULL, then all * registered notifiers are called, otherwise only the @nb notifier is called. * Returns the number of sent notifications. */ int ubi_enumerate_volumes(struct notifier_block *nb) { int i, count = 0; /* * Since the @ubi_devices_mutex is locked, and we are not going to * change @ubi_devices, we do not have to lock @ubi_devices_lock. */ for (i = 0; i < UBI_MAX_DEVICES; i++) { struct ubi_device *ubi = ubi_devices[i]; if (!ubi) continue; count += ubi_notify_all(ubi, UBI_VOLUME_ADDED, nb); } return count; } /** * ubi_get_device - get UBI device. * @ubi_num: UBI device number * * This function returns UBI device description object for UBI device number * @ubi_num, or %NULL if the device does not exist. This function increases the * device reference count to prevent removal of the device. In other words, the * device cannot be removed if its reference count is not zero. */ struct ubi_device *ubi_get_device(int ubi_num) { struct ubi_device *ubi; spin_lock(&ubi_devices_lock); ubi = ubi_devices[ubi_num]; if (ubi && ubi->is_dead) ubi = NULL; if (ubi) { ubi_assert(ubi->ref_count >= 0); ubi->ref_count += 1; get_device(&ubi->dev); } spin_unlock(&ubi_devices_lock); return ubi; } /** * ubi_put_device - drop an UBI device reference. 
* @ubi: UBI device description object */ void ubi_put_device(struct ubi_device *ubi) { spin_lock(&ubi_devices_lock); ubi->ref_count -= 1; put_device(&ubi->dev); spin_unlock(&ubi_devices_lock); } /** * ubi_get_by_major - get UBI device by character device major number. * @major: major number * * This function is similar to 'ubi_get_device()', but it searches the device * by its major number. */ struct ubi_device *ubi_get_by_major(int major) { int i; struct ubi_device *ubi; spin_lock(&ubi_devices_lock); for (i = 0; i < UBI_MAX_DEVICES; i++) { ubi = ubi_devices[i]; if (ubi && !ubi->is_dead && MAJOR(ubi->cdev.dev) == major) { ubi_assert(ubi->ref_count >= 0); ubi->ref_count += 1; get_device(&ubi->dev); spin_unlock(&ubi_devices_lock); return ubi; } } spin_unlock(&ubi_devices_lock); return NULL; } /** * ubi_major2num - get UBI device number by character device major number. * @major: major number * * This function searches UBI device number object by its major number. If UBI * device was not found, this function returns -ENODEV, otherwise the UBI device * number is returned. */ int ubi_major2num(int major) { int i, ubi_num = -ENODEV; spin_lock(&ubi_devices_lock); for (i = 0; i < UBI_MAX_DEVICES; i++) { struct ubi_device *ubi = ubi_devices[i]; if (ubi && !ubi->is_dead && MAJOR(ubi->cdev.dev) == major) { ubi_num = ubi->ubi_num; break; } } spin_unlock(&ubi_devices_lock); return ubi_num; } /* "Show" method for files in '/<sysfs>/class/ubi/ubiX/' */ static ssize_t dev_attribute_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t ret; struct ubi_device *ubi; /* * The below code looks weird, but it actually makes sense. We get the * UBI device reference from the contained 'struct ubi_device'. But it * is unclear if the device was removed or not yet. Indeed, if the * device was removed before we increased its reference count, * 'ubi_get_device()' will return -ENODEV and we fail. 
* * Remember, 'struct ubi_device' is freed in the release function, so * we still can use 'ubi->ubi_num'. */ ubi = container_of(dev, struct ubi_device, dev); if (attr == &dev_eraseblock_size) ret = sprintf(buf, "%d\n", ubi->leb_size); else if (attr == &dev_avail_eraseblocks) ret = sprintf(buf, "%d\n", ubi->avail_pebs); else if (attr == &dev_total_eraseblocks) ret = sprintf(buf, "%d\n", ubi->good_peb_count); else if (attr == &dev_volumes_count) ret = sprintf(buf, "%d\n", ubi->vol_count - UBI_INT_VOL_COUNT); else if (attr == &dev_max_ec) ret = sprintf(buf, "%d\n", ubi->max_ec); else if (attr == &dev_reserved_for_bad) ret = sprintf(buf, "%d\n", ubi->beb_rsvd_pebs); else if (attr == &dev_bad_peb_count) ret = sprintf(buf, "%d\n", ubi->bad_peb_count); else if (attr == &dev_max_vol_count) ret = sprintf(buf, "%d\n", ubi->vtbl_slots); else if (attr == &dev_min_io_size) ret = sprintf(buf, "%d\n", ubi->min_io_size); else if (attr == &dev_bgt_enabled) ret = sprintf(buf, "%d\n", ubi->thread_enabled); else if (attr == &dev_mtd_num) ret = sprintf(buf, "%d\n", ubi->mtd->index); else if (attr == &dev_ro_mode) ret = sprintf(buf, "%d\n", ubi->ro_mode); else ret = -EINVAL; return ret; } static struct attribute *ubi_dev_attrs[] = { &dev_eraseblock_size.attr, &dev_avail_eraseblocks.attr, &dev_total_eraseblocks.attr, &dev_volumes_count.attr, &dev_max_ec.attr, &dev_reserved_for_bad.attr, &dev_bad_peb_count.attr, &dev_max_vol_count.attr, &dev_min_io_size.attr, &dev_bgt_enabled.attr, &dev_mtd_num.attr, &dev_ro_mode.attr, NULL }; ATTRIBUTE_GROUPS(ubi_dev); static void dev_release(struct device *dev) { struct ubi_device *ubi = container_of(dev, struct ubi_device, dev); kfree(ubi); } /** * kill_volumes - destroy all user volumes. * @ubi: UBI device description object */ static void kill_volumes(struct ubi_device *ubi) { int i; for (i = 0; i < ubi->vtbl_slots; i++) if (ubi->volumes[i]) ubi_free_volume(ubi, ubi->volumes[i]); } /** * uif_init - initialize user interfaces for an UBI device. 
* @ubi: UBI device description object
 *
 * Sets up the user-visible side of an UBI device: it reserves a character
 * device region, registers the UBI character device together with its
 * 'struct device', and then adds every volume found in @ubi->volumes. On
 * failure everything registered so far is undone before returning.
 *
 * This function returns zero in case of success and a negative error code in
 * case of failure.
 */
static int uif_init(struct ubi_device *ubi)
{
	dev_t devno;
	int slot, err;

	sprintf(ubi->ubi_name, UBI_NAME_STR "%d", ubi->ubi_num);

	/*
	 * Major numbers for the UBI character devices are allocated
	 * dynamically. Major numbers of volume character devices are
	 * equivalent to ones of the corresponding UBI character device. Minor
	 * numbers of UBI character devices are 0, while minor numbers of
	 * volume character devices start from 1. Thus, we allocate one major
	 * number and ubi->vtbl_slots + 1 minor numbers.
	 */
	err = alloc_chrdev_region(&devno, 0, ubi->vtbl_slots + 1,
				  ubi->ubi_name);
	if (err) {
		ubi_err(ubi, "cannot register UBI character devices");
		return err;
	}

	ubi->dev.devt = devno;

	ubi_assert(MINOR(devno) == 0);
	cdev_init(&ubi->cdev, &ubi_cdev_operations);
	dbg_gen("%s major is %u", ubi->ubi_name, MAJOR(devno));
	ubi->cdev.owner = THIS_MODULE;

	dev_set_name(&ubi->dev, UBI_NAME_STR "%d", ubi->ubi_num);
	err = cdev_device_add(&ubi->cdev, &ubi->dev);
	if (err)
		goto out_unreg;

	for (slot = 0; slot < ubi->vtbl_slots; slot++) {
		struct ubi_volume *vol = ubi->volumes[slot];

		if (!vol)
			continue;

		err = ubi_add_volume(ubi, vol);
		if (!err)
			continue;

		ubi_err(ubi, "cannot add volume %d", slot);
		/* Clear the slot so that kill_volumes() below skips it */
		ubi->volumes[slot] = NULL;
		goto out_volumes;
	}

	return 0;

out_volumes:
	kill_volumes(ubi);
	cdev_device_del(&ubi->cdev, &ubi->dev);
out_unreg:
	unregister_chrdev_region(ubi->cdev.dev, ubi->vtbl_slots + 1);
	ubi_err(ubi, "cannot initialize UBI %s, error %d",
		ubi->ubi_name, err);
	return err;
}

/**
 * uif_close - close user interfaces for an UBI device.
* @ubi: UBI device description object * * Note, since this function un-registers UBI volume device objects (@vol->dev), * the memory allocated voe the volumes is freed as well (in the release * function). */ static void uif_close(struct ubi_device *ubi) { kill_volumes(ubi); cdev_device_del(&ubi->cdev, &ubi->dev); unregister_chrdev_region(ubi->cdev.dev, ubi->vtbl_slots + 1); } /** * ubi_free_volumes_from - free volumes from specific index. * @ubi: UBI device description object * @from: the start index used for volume free. */ static void ubi_free_volumes_from(struct ubi_device *ubi, int from) { int i; for (i = from; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) { if (!ubi->volumes[i] || ubi->volumes[i]->is_dead) continue; ubi_eba_replace_table(ubi->volumes[i], NULL); ubi_fastmap_destroy_checkmap(ubi->volumes[i]); kfree(ubi->volumes[i]); ubi->volumes[i] = NULL; } } /** * ubi_free_all_volumes - free all volumes. * @ubi: UBI device description object */ void ubi_free_all_volumes(struct ubi_device *ubi) { ubi_free_volumes_from(ubi, 0); } /** * ubi_free_internal_volumes - free internal volumes. * @ubi: UBI device description object */ void ubi_free_internal_volumes(struct ubi_device *ubi) { ubi_free_volumes_from(ubi, ubi->vtbl_slots); } static int get_bad_peb_limit(const struct ubi_device *ubi, int max_beb_per1024) { int limit, device_pebs; uint64_t device_size; if (!max_beb_per1024) { /* * Since max_beb_per1024 has not been set by the user in either * the cmdline or Kconfig, use mtd_max_bad_blocks to set the * limit if it is supported by the device. */ limit = mtd_max_bad_blocks(ubi->mtd, 0, ubi->mtd->size); if (limit < 0) return 0; return limit; } /* * Here we are using size of the entire flash chip and * not just the MTD partition size because the maximum * number of bad eraseblocks is a percentage of the * whole device and bad eraseblocks are not fairly * distributed over the flash chip. 
So the worst case * is that all the bad eraseblocks of the chip are in * the MTD partition we are attaching (ubi->mtd). */ device_size = mtd_get_device_size(ubi->mtd); device_pebs = mtd_div_by_eb(device_size, ubi->mtd); limit = mult_frac(device_pebs, max_beb_per1024, 1024); /* Round it up */ if (mult_frac(limit, 1024, max_beb_per1024) < device_pebs) limit += 1; return limit; } /** * io_init - initialize I/O sub-system for a given UBI device. * @ubi: UBI device description object * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs * * If @ubi->vid_hdr_offset or @ubi->leb_start is zero, default offsets are * assumed: * o EC header is always at offset zero - this cannot be changed; * o VID header starts just after the EC header at the closest address * aligned to @io->hdrs_min_io_size; * o data starts just after the VID header at the closest address aligned to * @io->min_io_size * * This function returns zero in case of success and a negative error code in * case of failure. */ static int io_init(struct ubi_device *ubi, int max_beb_per1024) { dbg_gen("sizeof(struct ubi_ainf_peb) %zu", sizeof(struct ubi_ainf_peb)); dbg_gen("sizeof(struct ubi_wl_entry) %zu", sizeof(struct ubi_wl_entry)); if (ubi->mtd->numeraseregions != 0) { /* * Some flashes have several erase regions. Different regions * may have different eraseblock size and other * characteristics. It looks like mostly multi-region flashes * have one "main" region and one or more small regions to * store boot loader code or boot parameters or whatever. I * guess we should just pick the largest region. But this is * not implemented. */ ubi_err(ubi, "multiple regions, not implemented"); return -EINVAL; } if (ubi->vid_hdr_offset < 0) return -EINVAL; /* * Note, in this implementation we support MTD devices with 0x7FFFFFFF * physical eraseblocks maximum. 
*/ ubi->peb_size = ubi->mtd->erasesize; ubi->peb_count = mtd_div_by_eb(ubi->mtd->size, ubi->mtd); ubi->flash_size = ubi->mtd->size; if (mtd_can_have_bb(ubi->mtd)) { ubi->bad_allowed = 1; ubi->bad_peb_limit = get_bad_peb_limit(ubi, max_beb_per1024); } if (ubi->mtd->type == MTD_NORFLASH) ubi->nor_flash = 1; ubi->min_io_size = ubi->mtd->writesize; ubi->hdrs_min_io_size = ubi->mtd->writesize >> ubi->mtd->subpage_sft; /* * Make sure minimal I/O unit is power of 2. Note, there is no * fundamental reason for this assumption. It is just an optimization * which allows us to avoid costly division operations. */ if (!is_power_of_2(ubi->min_io_size)) { ubi_err(ubi, "min. I/O unit (%d) is not power of 2", ubi->min_io_size); return -EINVAL; } ubi_assert(ubi->hdrs_min_io_size > 0); ubi_assert(ubi->hdrs_min_io_size <= ubi->min_io_size); ubi_assert(ubi->min_io_size % ubi->hdrs_min_io_size == 0); ubi->max_write_size = ubi->mtd->writebufsize; /* * Maximum write size has to be greater or equivalent to min. I/O * size, and be multiple of min. I/O size. */ if (ubi->max_write_size < ubi->min_io_size || ubi->max_write_size % ubi->min_io_size || !is_power_of_2(ubi->max_write_size)) { ubi_err(ubi, "bad write buffer size %d for %d min. 
I/O unit", ubi->max_write_size, ubi->min_io_size); return -EINVAL; } /* Calculate default aligned sizes of EC and VID headers */ ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size); ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size); dbg_gen("min_io_size %d", ubi->min_io_size); dbg_gen("max_write_size %d", ubi->max_write_size); dbg_gen("hdrs_min_io_size %d", ubi->hdrs_min_io_size); dbg_gen("ec_hdr_alsize %d", ubi->ec_hdr_alsize); dbg_gen("vid_hdr_alsize %d", ubi->vid_hdr_alsize); if (ubi->vid_hdr_offset == 0) /* Default offset */ ubi->vid_hdr_offset = ubi->vid_hdr_aloffset = ubi->ec_hdr_alsize; else { ubi->vid_hdr_aloffset = ubi->vid_hdr_offset & ~(ubi->hdrs_min_io_size - 1); ubi->vid_hdr_shift = ubi->vid_hdr_offset - ubi->vid_hdr_aloffset; } /* * Memory allocation for VID header is ubi->vid_hdr_alsize * which is described in comments in io.c. * Make sure VID header shift + UBI_VID_HDR_SIZE not exceeds * ubi->vid_hdr_alsize, so that all vid header operations * won't access memory out of bounds. 
*/ if ((ubi->vid_hdr_shift + UBI_VID_HDR_SIZE) > ubi->vid_hdr_alsize) { ubi_err(ubi, "Invalid VID header offset %d, VID header shift(%d)" " + VID header size(%zu) > VID header aligned size(%d).", ubi->vid_hdr_offset, ubi->vid_hdr_shift, UBI_VID_HDR_SIZE, ubi->vid_hdr_alsize); return -EINVAL; } /* Similar for the data offset */ ubi->leb_start = ubi->vid_hdr_offset + UBI_VID_HDR_SIZE; ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size); dbg_gen("vid_hdr_offset %d", ubi->vid_hdr_offset); dbg_gen("vid_hdr_aloffset %d", ubi->vid_hdr_aloffset); dbg_gen("vid_hdr_shift %d", ubi->vid_hdr_shift); dbg_gen("leb_start %d", ubi->leb_start); /* The shift must be aligned to 32-bit boundary */ if (ubi->vid_hdr_shift % 4) { ubi_err(ubi, "unaligned VID header shift %d", ubi->vid_hdr_shift); return -EINVAL; } /* Check sanity */ if (ubi->vid_hdr_offset < UBI_EC_HDR_SIZE || ubi->leb_start < ubi->vid_hdr_offset + UBI_VID_HDR_SIZE || ubi->leb_start > ubi->peb_size - UBI_VID_HDR_SIZE || ubi->leb_start & (ubi->min_io_size - 1)) { ubi_err(ubi, "bad VID header (%d) or data offsets (%d)", ubi->vid_hdr_offset, ubi->leb_start); return -EINVAL; } /* * Set maximum amount of physical erroneous eraseblocks to be 10%. * Erroneous PEB are those which have read errors. */ ubi->max_erroneous = ubi->peb_count / 10; if (ubi->max_erroneous < 16) ubi->max_erroneous = 16; dbg_gen("max_erroneous %d", ubi->max_erroneous); /* * It may happen that EC and VID headers are situated in one minimal * I/O unit. In this case we can only accept this UBI image in * read-only mode. 
*/ if (ubi->vid_hdr_offset + UBI_VID_HDR_SIZE <= ubi->hdrs_min_io_size) { ubi_warn(ubi, "EC and VID headers are in the same minimal I/O unit, switch to read-only mode"); ubi->ro_mode = 1; } ubi->leb_size = ubi->peb_size - ubi->leb_start; if (!(ubi->mtd->flags & MTD_WRITEABLE)) { ubi_msg(ubi, "MTD device %d is write-protected, attach in read-only mode", ubi->mtd->index); ubi->ro_mode = 1; } /* * Note, ideally, we have to initialize @ubi->bad_peb_count here. But * unfortunately, MTD does not provide this information. We should loop * over all physical eraseblocks and invoke mtd->block_is_bad() for * each physical eraseblock. So, we leave @ubi->bad_peb_count * uninitialized so far. */ return 0; } /** * autoresize - re-size the volume which has the "auto-resize" flag set. * @ubi: UBI device description object * @vol_id: ID of the volume to re-size * * This function re-sizes the volume marked by the %UBI_VTBL_AUTORESIZE_FLG in * the volume table to the largest possible size. See comments in ubi-header.h * for more description of the flag. Returns zero in case of success and a * negative error code in case of failure. */ static int autoresize(struct ubi_device *ubi, int vol_id) { struct ubi_volume_desc desc; struct ubi_volume *vol = ubi->volumes[vol_id]; int err, old_reserved_pebs = vol->reserved_pebs; if (ubi->ro_mode) { ubi_warn(ubi, "skip auto-resize because of R/O mode"); return 0; } /* * Clear the auto-resize flag in the volume in-memory copy of the * volume table, and 'ubi_resize_volume()' will propagate this change * to the flash. */ ubi->vtbl[vol_id].flags &= ~UBI_VTBL_AUTORESIZE_FLG; if (ubi->avail_pebs == 0) { struct ubi_vtbl_record vtbl_rec; /* * No available PEBs to re-size the volume, clear the flag on * flash and exit. 
*/ vtbl_rec = ubi->vtbl[vol_id]; err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); if (err) ubi_err(ubi, "cannot clean auto-resize flag for volume %d", vol_id); } else { desc.vol = vol; err = ubi_resize_volume(&desc, old_reserved_pebs + ubi->avail_pebs); if (err) ubi_err(ubi, "cannot auto-resize volume %d", vol_id); } if (err) return err; ubi_msg(ubi, "volume %d (\"%s\") re-sized from %d to %d LEBs", vol_id, vol->name, old_reserved_pebs, vol->reserved_pebs); return 0; } /** * ubi_attach_mtd_dev - attach an MTD device. * @mtd: MTD device description object * @ubi_num: number to assign to the new UBI device * @vid_hdr_offset: VID header offset * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs * @disable_fm: whether disable fastmap * @need_resv_pool: whether reserve pebs to fill fm_pool * * This function attaches MTD device @mtd_dev to UBI and assign @ubi_num number * to the newly created UBI device, unless @ubi_num is %UBI_DEV_NUM_AUTO, in * which case this function finds a vacant device number and assigns it * automatically. Returns the new UBI device number in case of success and a * negative error code in case of failure. * * If @disable_fm is true, ubi doesn't create new fastmap even the module param * 'fm_autoconvert' is set, and existed old fastmap will be destroyed after * doing full scanning. * * Note, the invocations of this function has to be serialized by the * @ubi_devices_mutex. */ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset, int max_beb_per1024, bool disable_fm, bool need_resv_pool) { struct ubi_device *ubi; int i, err; if (max_beb_per1024 < 0 || max_beb_per1024 > MAX_MTD_UBI_BEB_LIMIT) return -EINVAL; if (!max_beb_per1024) max_beb_per1024 = CONFIG_MTD_UBI_BEB_LIMIT; /* * Check if we already have the same MTD device attached. * * Note, this function assumes that UBI devices creations and deletions * are serialized, so it does not take the &ubi_devices_lock. 
*/ for (i = 0; i < UBI_MAX_DEVICES; i++) { ubi = ubi_devices[i]; if (ubi && mtd->index == ubi->mtd->index) { pr_err("ubi: mtd%d is already attached to ubi%d\n", mtd->index, i); return -EEXIST; } } /* * Make sure this MTD device is not emulated on top of an UBI volume * already. Well, generally this recursion works fine, but there are * different problems like the UBI module takes a reference to itself * by attaching (and thus, opening) the emulated MTD device. This * results in inability to unload the module. And in general it makes * no sense to attach emulated MTD devices, so we prohibit this. */ if (mtd->type == MTD_UBIVOLUME) { pr_err("ubi: refuse attaching mtd%d - it is already emulated on top of UBI\n", mtd->index); return -EINVAL; } /* * Both UBI and UBIFS have been designed for SLC NAND and NOR flashes. * MLC NAND is different and needs special care, otherwise UBI or UBIFS * will die soon and you will lose all your data. * Relax this rule if the partition we're attaching to operates in SLC * mode. */ if (mtd->type == MTD_MLCNANDFLASH && !(mtd->flags & MTD_SLC_ON_MLC_EMULATION)) { pr_err("ubi: refuse attaching mtd%d - MLC NAND is not supported\n", mtd->index); return -EINVAL; } /* UBI cannot work on flashes with zero erasesize. 
*/ if (!mtd->erasesize) { pr_err("ubi: refuse attaching mtd%d - zero erasesize flash is not supported\n", mtd->index); return -EINVAL; } if (ubi_num == UBI_DEV_NUM_AUTO) { /* Search for an empty slot in the @ubi_devices array */ for (ubi_num = 0; ubi_num < UBI_MAX_DEVICES; ubi_num++) if (!ubi_devices[ubi_num]) break; if (ubi_num == UBI_MAX_DEVICES) { pr_err("ubi: only %d UBI devices may be created\n", UBI_MAX_DEVICES); return -ENFILE; } } else { if (ubi_num >= UBI_MAX_DEVICES) return -EINVAL; /* Make sure ubi_num is not busy */ if (ubi_devices[ubi_num]) { pr_err("ubi: ubi%i already exists\n", ubi_num); return -EEXIST; } } ubi = kzalloc(sizeof(struct ubi_device), GFP_KERNEL); if (!ubi) return -ENOMEM; device_initialize(&ubi->dev); ubi->dev.release = dev_release; ubi->dev.class = &ubi_class; ubi->dev.groups = ubi_dev_groups; ubi->dev.parent = &mtd->dev; ubi->mtd = mtd; ubi->ubi_num = ubi_num; ubi->vid_hdr_offset = vid_hdr_offset; ubi->autoresize_vol_id = -1; #ifdef CONFIG_MTD_UBI_FASTMAP ubi->fm_pool.used = ubi->fm_pool.size = 0; ubi->fm_wl_pool.used = ubi->fm_wl_pool.size = 0; /* * fm_pool.max_size is 5% of the total number of PEBs but it's also * between UBI_FM_MAX_POOL_SIZE and UBI_FM_MIN_POOL_SIZE. */ ubi->fm_pool.max_size = min(((int)mtd_div_by_eb(ubi->mtd->size, ubi->mtd) / 100) * 5, UBI_FM_MAX_POOL_SIZE); ubi->fm_pool.max_size = max(ubi->fm_pool.max_size, UBI_FM_MIN_POOL_SIZE); ubi->fm_wl_pool.max_size = ubi->fm_pool.max_size / 2; ubi->fm_pool_rsv_cnt = need_resv_pool ? ubi->fm_pool.max_size : 0; ubi->fm_disabled = (!fm_autoconvert || disable_fm) ? 
1 : 0; if (fm_debug) ubi_enable_dbg_chk_fastmap(ubi); if (!ubi->fm_disabled && (int)mtd_div_by_eb(ubi->mtd->size, ubi->mtd) <= UBI_FM_MAX_START) { ubi_err(ubi, "More than %i PEBs are needed for fastmap, sorry.", UBI_FM_MAX_START); ubi->fm_disabled = 1; } ubi_msg(ubi, "default fastmap pool size: %d", ubi->fm_pool.max_size); ubi_msg(ubi, "default fastmap WL pool size: %d", ubi->fm_wl_pool.max_size); #else ubi->fm_disabled = 1; #endif mutex_init(&ubi->buf_mutex); mutex_init(&ubi->ckvol_mutex); mutex_init(&ubi->device_mutex); spin_lock_init(&ubi->volumes_lock); init_rwsem(&ubi->fm_protect); init_rwsem(&ubi->fm_eba_sem); ubi_msg(ubi, "attaching mtd%d", mtd->index); err = io_init(ubi, max_beb_per1024); if (err) goto out_free; err = -ENOMEM; ubi->peb_buf = vmalloc(ubi->peb_size); if (!ubi->peb_buf) goto out_free; #ifdef CONFIG_MTD_UBI_FASTMAP ubi->fm_size = ubi_calc_fm_size(ubi); ubi->fm_buf = vzalloc(ubi->fm_size); if (!ubi->fm_buf) goto out_free; #endif err = ubi_attach(ubi, disable_fm ? 1 : 0); if (err) { ubi_err(ubi, "failed to attach mtd%d, error %d", mtd->index, err); goto out_free; } if (ubi->autoresize_vol_id != -1) { err = autoresize(ubi, ubi->autoresize_vol_id); if (err) goto out_detach; } err = uif_init(ubi); if (err) goto out_detach; err = ubi_debugfs_init_dev(ubi); if (err) goto out_uif; ubi->bgt_thread = kthread_create(ubi_thread, ubi, "%s", ubi->bgt_name); if (IS_ERR(ubi->bgt_thread)) { err = PTR_ERR(ubi->bgt_thread); ubi_err(ubi, "cannot spawn \"%s\", error %d", ubi->bgt_name, err); goto out_debugfs; } ubi_msg(ubi, "attached mtd%d (name \"%s\", size %llu MiB)", mtd->index, mtd->name, ubi->flash_size >> 20); ubi_msg(ubi, "PEB size: %d bytes (%d KiB), LEB size: %d bytes", ubi->peb_size, ubi->peb_size >> 10, ubi->leb_size); ubi_msg(ubi, "min./max. 
I/O unit sizes: %d/%d, sub-page size %d", ubi->min_io_size, ubi->max_write_size, ubi->hdrs_min_io_size); ubi_msg(ubi, "VID header offset: %d (aligned %d), data offset: %d", ubi->vid_hdr_offset, ubi->vid_hdr_aloffset, ubi->leb_start); ubi_msg(ubi, "good PEBs: %d, bad PEBs: %d, corrupted PEBs: %d", ubi->good_peb_count, ubi->bad_peb_count, ubi->corr_peb_count); ubi_msg(ubi, "user volume: %d, internal volumes: %d, max. volumes count: %d", ubi->vol_count - UBI_INT_VOL_COUNT, UBI_INT_VOL_COUNT, ubi->vtbl_slots); ubi_msg(ubi, "max/mean erase counter: %d/%d, WL threshold: %d, image sequence number: %u", ubi->max_ec, ubi->mean_ec, CONFIG_MTD_UBI_WL_THRESHOLD, ubi->image_seq); ubi_msg(ubi, "available PEBs: %d, total reserved PEBs: %d, PEBs reserved for bad PEB handling: %d", ubi->avail_pebs, ubi->rsvd_pebs, ubi->beb_rsvd_pebs); /* * The below lock makes sure we do not race with 'ubi_thread()' which * checks @ubi->thread_enabled. Otherwise we may fail to wake it up. */ spin_lock(&ubi->wl_lock); ubi->thread_enabled = 1; wake_up_process(ubi->bgt_thread); spin_unlock(&ubi->wl_lock); ubi_devices[ubi_num] = ubi; ubi_notify_all(ubi, UBI_VOLUME_ADDED, NULL); return ubi_num; out_debugfs: ubi_debugfs_exit_dev(ubi); out_uif: uif_close(ubi); out_detach: ubi_wl_close(ubi); ubi_free_all_volumes(ubi); vfree(ubi->vtbl); out_free: vfree(ubi->peb_buf); vfree(ubi->fm_buf); put_device(&ubi->dev); return err; } /** * ubi_detach_mtd_dev - detach an MTD device. * @ubi_num: UBI device number to detach from * @anyway: detach MTD even if device reference count is not zero * * This function destroys an UBI device number @ubi_num and detaches the * underlying MTD device. Returns zero in case of success and %-EBUSY if the * UBI device is busy and cannot be destroyed, and %-EINVAL if it does not * exist. * * Note, the invocations of this function has to be serialized by the * @ubi_devices_mutex. 
*/ int ubi_detach_mtd_dev(int ubi_num, int anyway) { struct ubi_device *ubi; if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES) return -EINVAL; ubi = ubi_get_device(ubi_num); if (!ubi) return -EINVAL; spin_lock(&ubi_devices_lock); ubi->ref_count -= 1; if (ubi->ref_count) { if (!anyway) { spin_unlock(&ubi_devices_lock); return -EBUSY; } /* This may only happen if there is a bug */ ubi_err(ubi, "%s reference count %d, destroy anyway", ubi->ubi_name, ubi->ref_count); } ubi->is_dead = true; spin_unlock(&ubi_devices_lock); ubi_notify_all(ubi, UBI_VOLUME_SHUTDOWN, NULL); spin_lock(&ubi_devices_lock); put_device(&ubi->dev); ubi_devices[ubi_num] = NULL; spin_unlock(&ubi_devices_lock); ubi_assert(ubi_num == ubi->ubi_num); ubi_notify_all(ubi, UBI_VOLUME_REMOVED, NULL); ubi_msg(ubi, "detaching mtd%d", ubi->mtd->index); #ifdef CONFIG_MTD_UBI_FASTMAP /* If we don't write a new fastmap at detach time we lose all * EC updates that have been made since the last written fastmap. * In case of fastmap debugging we omit the update to simulate an * unclean shutdown. */ if (!ubi_dbg_chk_fastmap(ubi)) ubi_update_fastmap(ubi); #endif /* * Before freeing anything, we have to stop the background thread to * prevent it from doing anything on this device while we are freeing. */ if (ubi->bgt_thread) kthread_stop(ubi->bgt_thread); #ifdef CONFIG_MTD_UBI_FASTMAP cancel_work_sync(&ubi->fm_work); #endif ubi_debugfs_exit_dev(ubi); uif_close(ubi); ubi_wl_close(ubi); ubi_free_internal_volumes(ubi); vfree(ubi->vtbl); vfree(ubi->peb_buf); vfree(ubi->fm_buf); ubi_msg(ubi, "mtd%d is detached", ubi->mtd->index); put_mtd_device(ubi->mtd); put_device(&ubi->dev); return 0; } /** * open_mtd_by_chdev - open an MTD device by its character device node path. * @mtd_dev: MTD character device node path * * This helper function opens an MTD device by its character node device path. * Returns MTD device description object in case of success and a negative * error code in case of failure. 
*/ static struct mtd_info * __init open_mtd_by_chdev(const char *mtd_dev) { int err, minor; struct path path; struct kstat stat; /* Probably this is an MTD character device node path */ err = kern_path(mtd_dev, LOOKUP_FOLLOW, &path); if (err) return ERR_PTR(err); err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); path_put(&path); if (err) return ERR_PTR(err); /* MTD device number is defined by the major / minor numbers */ if (MAJOR(stat.rdev) != MTD_CHAR_MAJOR || !S_ISCHR(stat.mode)) return ERR_PTR(-EINVAL); minor = MINOR(stat.rdev); if (minor & 1) /* * Just do not think the "/dev/mtdrX" devices support is need, * so do not support them to avoid doing extra work. */ return ERR_PTR(-EINVAL); return get_mtd_device(NULL, minor / 2); } /** * open_mtd_device - open MTD device by name, character device path, or number. * @mtd_dev: name, character device node path, or MTD device device number * * This function tries to open and MTD device described by @mtd_dev string, * which is first treated as ASCII MTD device number, and if it is not true, it * is treated as MTD device name, and if that is also not true, it is treated * as MTD character device node path. Returns MTD device description object in * case of success and a negative error code in case of failure. */ static struct mtd_info * __init open_mtd_device(const char *mtd_dev) { struct mtd_info *mtd; int mtd_num; char *endp; mtd_num = simple_strtoul(mtd_dev, &endp, 0); if (*endp != '\0' || mtd_dev == endp) { /* * This does not look like an ASCII integer, probably this is * MTD device name. 
*/ mtd = get_mtd_device_nm(mtd_dev); if (PTR_ERR(mtd) == -ENODEV) /* Probably this is an MTD character device node path */ mtd = open_mtd_by_chdev(mtd_dev); } else mtd = get_mtd_device(NULL, mtd_num); return mtd; } static void ubi_notify_add(struct mtd_info *mtd) { struct device_node *np = mtd_get_of_node(mtd); int err; if (!of_device_is_compatible(np, "linux,ubi")) return; /* * we are already holding &mtd_table_mutex, but still need * to bump refcount */ err = __get_mtd_device(mtd); if (err) return; /* called while holding mtd_table_mutex */ mutex_lock_nested(&ubi_devices_mutex, SINGLE_DEPTH_NESTING); err = ubi_attach_mtd_dev(mtd, UBI_DEV_NUM_AUTO, 0, 0, false, false); mutex_unlock(&ubi_devices_mutex); if (err < 0) __put_mtd_device(mtd); } static void ubi_notify_remove(struct mtd_info *mtd) { /* do nothing for now */ } static struct mtd_notifier ubi_mtd_notifier = { .add = ubi_notify_add, .remove = ubi_notify_remove, }; static int __init ubi_init_attach(void) { int err, i, k; /* Attach MTD devices */ for (i = 0; i < mtd_devs; i++) { struct mtd_dev_param *p = &mtd_dev_param[i]; struct mtd_info *mtd; cond_resched(); mtd = open_mtd_device(p->name); if (IS_ERR(mtd)) { err = PTR_ERR(mtd); pr_err("UBI error: cannot open mtd %s, error %d\n", p->name, err); /* See comment below re-ubi_is_module(). */ if (ubi_is_module()) goto out_detach; continue; } mutex_lock(&ubi_devices_mutex); err = ubi_attach_mtd_dev(mtd, p->ubi_num, p->vid_hdr_offs, p->max_beb_per1024, p->enable_fm == 0, p->need_resv_pool != 0); mutex_unlock(&ubi_devices_mutex); if (err < 0) { pr_err("UBI error: cannot attach mtd%d\n", mtd->index); put_mtd_device(mtd); /* * Originally UBI stopped initializing on any error. * However, later on it was found out that this * behavior is not very good when UBI is compiled into * the kernel and the MTD devices to attach are passed * through the command line. Indeed, UBI failure * stopped whole boot sequence. 
* * To fix this, we changed the behavior for the * non-module case, but preserved the old behavior for * the module case, just for compatibility. This is a * little inconsistent, though. */ if (ubi_is_module()) goto out_detach; } } return 0; out_detach: for (k = 0; k < i; k++) if (ubi_devices[k]) { mutex_lock(&ubi_devices_mutex); ubi_detach_mtd_dev(ubi_devices[k]->ubi_num, 1); mutex_unlock(&ubi_devices_mutex); } return err; } #ifndef CONFIG_MTD_UBI_MODULE late_initcall(ubi_init_attach); #endif static int __init ubi_init(void) { int err; /* Ensure that EC and VID headers have correct size */ BUILD_BUG_ON(sizeof(struct ubi_ec_hdr) != 64); BUILD_BUG_ON(sizeof(struct ubi_vid_hdr) != 64); if (mtd_devs > UBI_MAX_DEVICES) { pr_err("UBI error: too many MTD devices, maximum is %d\n", UBI_MAX_DEVICES); return -EINVAL; } /* Create base sysfs directory and sysfs files */ err = class_register(&ubi_class); if (err < 0) return err; err = misc_register(&ubi_ctrl_cdev); if (err) { pr_err("UBI error: cannot register device\n"); goto out; } ubi_wl_entry_slab = kmem_cache_create("ubi_wl_entry_slab", sizeof(struct ubi_wl_entry), 0, 0, NULL); if (!ubi_wl_entry_slab) { err = -ENOMEM; goto out_dev_unreg; } err = ubi_debugfs_init(); if (err) goto out_slab; err = ubiblock_init(); if (err) { pr_err("UBI error: block: cannot initialize, error %d\n", err); /* See comment above re-ubi_is_module(). 
*/ if (ubi_is_module()) goto out_debugfs; } register_mtd_user(&ubi_mtd_notifier); if (ubi_is_module()) { err = ubi_init_attach(); if (err) goto out_mtd_notifier; } return 0; out_mtd_notifier: unregister_mtd_user(&ubi_mtd_notifier); ubiblock_exit(); out_debugfs: ubi_debugfs_exit(); out_slab: kmem_cache_destroy(ubi_wl_entry_slab); out_dev_unreg: misc_deregister(&ubi_ctrl_cdev); out: class_unregister(&ubi_class); pr_err("UBI error: cannot initialize UBI, error %d\n", err); return err; } device_initcall(ubi_init); static void __exit ubi_exit(void) { int i; ubiblock_exit(); unregister_mtd_user(&ubi_mtd_notifier); for (i = 0; i < UBI_MAX_DEVICES; i++) if (ubi_devices[i]) { mutex_lock(&ubi_devices_mutex); ubi_detach_mtd_dev(ubi_devices[i]->ubi_num, 1); mutex_unlock(&ubi_devices_mutex); } ubi_debugfs_exit(); kmem_cache_destroy(ubi_wl_entry_slab); misc_deregister(&ubi_ctrl_cdev); class_unregister(&ubi_class); } module_exit(ubi_exit); /** * bytes_str_to_int - convert a number of bytes string into an integer. * @str: the string to convert * * This function returns positive resulting integer in case of success and a * negative error code in case of failure. */ static int bytes_str_to_int(const char *str) { char *endp; unsigned long result; result = simple_strtoul(str, &endp, 0); if (str == endp || result >= INT_MAX) { pr_err("UBI error: incorrect bytes count: \"%s\"\n", str); return -EINVAL; } switch (*endp) { case 'G': result *= 1024; fallthrough; case 'M': result *= 1024; fallthrough; case 'K': result *= 1024; break; case '\0': break; default: pr_err("UBI error: incorrect bytes count: \"%s\"\n", str); return -EINVAL; } return result; } /** * ubi_mtd_param_parse - parse the 'mtd=' UBI parameter. * @val: the parameter value to parse * @kp: not used * * This function returns zero in case of success and a negative error code in * case of error. 
*/ static int ubi_mtd_param_parse(const char *val, const struct kernel_param *kp) { int i, len; struct mtd_dev_param *p; char buf[MTD_PARAM_LEN_MAX]; char *pbuf = &buf[0]; char *tokens[MTD_PARAM_MAX_COUNT], *token; if (!val) return -EINVAL; if (mtd_devs == UBI_MAX_DEVICES) { pr_err("UBI error: too many parameters, max. is %d\n", UBI_MAX_DEVICES); return -EINVAL; } len = strnlen(val, MTD_PARAM_LEN_MAX); if (len == MTD_PARAM_LEN_MAX) { pr_err("UBI error: parameter \"%s\" is too long, max. is %d\n", val, MTD_PARAM_LEN_MAX); return -EINVAL; } if (len == 0) { pr_warn("UBI warning: empty 'mtd=' parameter - ignored\n"); return 0; } strcpy(buf, val); /* Get rid of the final newline */ if (buf[len - 1] == '\n') buf[len - 1] = '\0'; for (i = 0; i < MTD_PARAM_MAX_COUNT; i++) tokens[i] = strsep(&pbuf, ","); if (pbuf) { pr_err("UBI error: too many arguments at \"%s\"\n", val); return -EINVAL; } p = &mtd_dev_param[mtd_devs]; strcpy(&p->name[0], tokens[0]); token = tokens[1]; if (token) { p->vid_hdr_offs = bytes_str_to_int(token); if (p->vid_hdr_offs < 0) return p->vid_hdr_offs; } token = tokens[2]; if (token) { int err = kstrtoint(token, 10, &p->max_beb_per1024); if (err) { pr_err("UBI error: bad value for max_beb_per1024 parameter: %s\n", token); return -EINVAL; } } token = tokens[3]; if (token) { int err = kstrtoint(token, 10, &p->ubi_num); if (err) { pr_err("UBI error: bad value for ubi_num parameter: %s\n", token); return -EINVAL; } } else p->ubi_num = UBI_DEV_NUM_AUTO; token = tokens[4]; if (token) { int err = kstrtoint(token, 10, &p->enable_fm); if (err) { pr_err("UBI error: bad value for enable_fm parameter: %s\n", token); return -EINVAL; } } else p->enable_fm = 0; token = tokens[5]; if (token) { int err = kstrtoint(token, 10, &p->need_resv_pool); if (err) { pr_err("UBI error: bad value for need_resv_pool parameter: %s\n", token); return -EINVAL; } } else p->need_resv_pool = 0; mtd_devs += 1; return 0; } module_param_call(mtd, ubi_mtd_param_parse, NULL, NULL, 0400); 
MODULE_PARM_DESC(mtd, "MTD devices to attach. Parameter format: mtd=<name|num|path>[,<vid_hdr_offs>[,max_beb_per1024[,ubi_num]]].\n" "Multiple \"mtd\" parameters may be specified.\n" "MTD devices may be specified by their number, name, or path to the MTD character device node.\n" "Optional \"vid_hdr_offs\" parameter specifies UBI VID header position to be used by UBI. (default value if 0)\n" "Optional \"max_beb_per1024\" parameter specifies the maximum expected bad eraseblock per 1024 eraseblocks. (default value (" __stringify(CONFIG_MTD_UBI_BEB_LIMIT) ") if 0)\n" "Optional \"ubi_num\" parameter specifies UBI device number which have to be assigned to the newly created UBI device (assigned automatically by default)\n" "Optional \"enable_fm\" parameter determines whether to enable fastmap during attach. If the value is non-zero, fastmap is enabled. Default value is 0.\n" "Optional \"need_resv_pool\" parameter determines whether to reserve pool->max_size pebs during attach. If the value is non-zero, peb reservation is enabled. Default value is 0.\n" "\n" "Example 1: mtd=/dev/mtd0 - attach MTD device /dev/mtd0.\n" "Example 2: mtd=content,1984 mtd=4 - attach MTD device with name \"content\" using VID header offset 1984, and MTD device number 4 with default VID header offset.\n" "Example 3: mtd=/dev/mtd1,0,25 - attach MTD device /dev/mtd1 using default VID header offset and reserve 25*nand_size_in_blocks/1024 erase blocks for bad block handling.\n" "Example 4: mtd=/dev/mtd1,0,0,5 - attach MTD device /dev/mtd1 to UBI 5 and using default values for the other fields.\n" "example 5: mtd=1,0,0,5 mtd=2,0,0,6,1 - attach MTD device /dev/mtd1 to UBI 5 and disable fastmap; attach MTD device /dev/mtd2 to UBI 6 and enable fastmap.(only works when fastmap is enabled and fm_autoconvert=Y).\n" "\t(e.g. 
if the NAND *chipset* has 4096 PEB, 100 will be reserved for this UBI device)."); #ifdef CONFIG_MTD_UBI_FASTMAP module_param(fm_autoconvert, bool, 0644); MODULE_PARM_DESC(fm_autoconvert, "Set this parameter to enable fastmap automatically on images without a fastmap."); module_param(fm_debug, bool, 0); MODULE_PARM_DESC(fm_debug, "Set this parameter to enable fastmap debugging by default. Warning, this will make fastmap slow!"); #endif MODULE_VERSION(__stringify(UBI_VERSION)); MODULE_DESCRIPTION("UBI - Unsorted Block Images"); MODULE_AUTHOR("Artem Bityutskiy"); MODULE_LICENSE("GPL"); |
| 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 | /* * mtdram - a test mtd device * Author: Alexander Larsson <alex@cendio.se> * * Copyright (c) 1999 Alexander Larsson <alex@cendio.se> * Copyright (c) 2005 Joern Engel <joern@wh.fh-wedel.de> * * This code is GPL * */ #include <linux/module.h> #include <linux/slab.h> #include <linux/ioport.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/mtd/mtd.h> #include <linux/mtd/mtdram.h> static unsigned long total_size = CONFIG_MTDRAM_TOTAL_SIZE; static unsigned long erase_size = CONFIG_MTDRAM_ERASE_SIZE; static unsigned long writebuf_size = 64; #define MTDRAM_TOTAL_SIZE (total_size * 1024) #define MTDRAM_ERASE_SIZE (erase_size * 1024) module_param(total_size, ulong, 0); MODULE_PARM_DESC(total_size, "Total device size in KiB"); module_param(erase_size, ulong, 0); MODULE_PARM_DESC(erase_size, "Device erase block size in KiB"); module_param(writebuf_size, ulong, 0); MODULE_PARM_DESC(writebuf_size, "Device write buf size in Bytes (Default: 64)"); // We could store these in the mtd structure, but we only support 1 device.. 
static struct mtd_info *mtd_info; static int check_offs_len(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret = 0; /* Start address must align on block boundary */ if (mtd_mod_by_eb(ofs, mtd)) { pr_debug("%s: unaligned address\n", __func__); ret = -EINVAL; } /* Length must align on block boundary */ if (mtd_mod_by_eb(len, mtd)) { pr_debug("%s: length not block aligned\n", __func__); ret = -EINVAL; } return ret; } static int ram_erase(struct mtd_info *mtd, struct erase_info *instr) { if (check_offs_len(mtd, instr->addr, instr->len)) return -EINVAL; memset((char *)mtd->priv + instr->addr, 0xff, instr->len); return 0; } static int ram_point(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, void **virt, resource_size_t *phys) { *virt = mtd->priv + from; *retlen = len; if (phys) { /* limit retlen to the number of contiguous physical pages */ unsigned long page_ofs = offset_in_page(*virt); void *addr = *virt - page_ofs; unsigned long pfn1, pfn0 = vmalloc_to_pfn(addr); *phys = __pfn_to_phys(pfn0) + page_ofs; len += page_ofs; while (len > PAGE_SIZE) { len -= PAGE_SIZE; addr += PAGE_SIZE; pfn0++; pfn1 = vmalloc_to_pfn(addr); if (pfn1 != pfn0) { *retlen = addr - *virt; break; } } } return 0; } static int ram_unpoint(struct mtd_info *mtd, loff_t from, size_t len) { return 0; } static int ram_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { memcpy(buf, mtd->priv + from, len); *retlen = len; return 0; } static int ram_write(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf) { memcpy((char *)mtd->priv + to, buf, len); *retlen = len; return 0; } static void __exit cleanup_mtdram(void) { if (mtd_info) { mtd_device_unregister(mtd_info); vfree(mtd_info->priv); kfree(mtd_info); } } int mtdram_init_device(struct mtd_info *mtd, void *mapped_address, unsigned long size, const char *name) { memset(mtd, 0, sizeof(*mtd)); /* Setup the MTD structure */ mtd->name = name; mtd->type = MTD_RAM; mtd->flags = 
MTD_CAP_RAM; mtd->size = size; mtd->writesize = 1; mtd->writebufsize = writebuf_size; mtd->erasesize = MTDRAM_ERASE_SIZE; mtd->priv = mapped_address; mtd->owner = THIS_MODULE; mtd->_erase = ram_erase; mtd->_point = ram_point; mtd->_unpoint = ram_unpoint; mtd->_read = ram_read; mtd->_write = ram_write; if (mtd_device_register(mtd, NULL, 0)) return -EIO; return 0; } static int __init init_mtdram(void) { void *addr; int err; if (!total_size) return -EINVAL; /* Allocate some memory */ mtd_info = kmalloc(sizeof(struct mtd_info), GFP_KERNEL); if (!mtd_info) return -ENOMEM; addr = vmalloc(MTDRAM_TOTAL_SIZE); if (!addr) { kfree(mtd_info); mtd_info = NULL; return -ENOMEM; } err = mtdram_init_device(mtd_info, addr, MTDRAM_TOTAL_SIZE, "mtdram test device"); if (err) { vfree(addr); kfree(mtd_info); mtd_info = NULL; return err; } memset(mtd_info->priv, 0xff, MTDRAM_TOTAL_SIZE); return err; } module_init(init_mtdram); module_exit(cleanup_mtdram); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Alexander Larsson <alexl@redhat.com>"); MODULE_DESCRIPTION("Simulated MTD driver for testing"); |
| 24 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/crc32c.h> #include <linux/ktime.h> #include "gfs2.h" #include "incore.h" #include "bmap.h" #include "glock.h" #include "glops.h" #include "log.h" #include "lops.h" #include "meta_io.h" #include "recovery.h" #include "super.h" #include "util.h" #include "dir.h" struct workqueue_struct *gfs2_recovery_wq; int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_glock *gl = ip->i_gl; u64 dblock; u32 extlen; int error; extlen = 32; error = gfs2_get_extent(&ip->i_inode, blk, &dblock, &extlen); if (error) return error; if (!dblock) { gfs2_consist_inode(ip); return -EIO; } *bh = gfs2_meta_ra(gl, dblock, extlen); return error; } int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) { struct list_head *head = &jd->jd_revoke_list; struct gfs2_revoke_replay *rr = NULL, *iter; list_for_each_entry(iter, head, rr_list) { if (iter->rr_blkno == blkno) { rr = iter; break; } } if (rr) { rr->rr_where = where; return 0; } rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS); if (!rr) return -ENOMEM; rr->rr_blkno = blkno; rr->rr_where = where; list_add(&rr->rr_list, head); return 1; } int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) { struct gfs2_revoke_replay *rr = NULL, *iter; int wrap, a, b, revoke; list_for_each_entry(iter, 
&jd->jd_revoke_list, rr_list) { if (iter->rr_blkno == blkno) { rr = iter; break; } } if (!rr) return 0; wrap = (rr->rr_where < jd->jd_replay_tail); a = (jd->jd_replay_tail < where); b = (where < rr->rr_where); revoke = (wrap) ? (a || b) : (a && b); return revoke; } void gfs2_revoke_clean(struct gfs2_jdesc *jd) { struct list_head *head = &jd->jd_revoke_list; struct gfs2_revoke_replay *rr; while (!list_empty(head)) { rr = list_first_entry(head, struct gfs2_revoke_replay, rr_list); list_del(&rr->rr_list); kfree(rr); } } int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, unsigned int blkno, struct gfs2_log_header_host *head) { u32 hash, crc; if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || (blkno && be32_to_cpu(lh->lh_blkno) != blkno)) return 1; hash = crc32(~0, lh, LH_V1_SIZE - 4); hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ if (be32_to_cpu(lh->lh_hash) != hash) return 1; crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, sdp->sd_sb.sb_bsize - LH_V1_SIZE - 4); if ((lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc)) return 1; head->lh_sequence = be64_to_cpu(lh->lh_sequence); head->lh_flags = be32_to_cpu(lh->lh_flags); head->lh_tail = be32_to_cpu(lh->lh_tail); head->lh_blkno = be32_to_cpu(lh->lh_blkno); head->lh_local_total = be64_to_cpu(lh->lh_local_total); head->lh_local_free = be64_to_cpu(lh->lh_local_free); head->lh_local_dinodes = be64_to_cpu(lh->lh_local_dinodes); return 0; } /** * get_log_header - read the log header for a given segment * @jd: the journal * @blk: the block to look at * @head: the log header to return * * Read the log header for a given segement in a given journal. Do a few * sanity checks on it. 
* * Returns: 0 on success, * 1 if the header was invalid or incomplete, * errno on error */ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, struct gfs2_log_header_host *head) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct buffer_head *bh; int error; error = gfs2_replay_read_block(jd, blk, &bh); if (error) return error; error = __get_log_header(sdp, (const struct gfs2_log_header *)bh->b_data, blk, head); brelse(bh); return error; } /** * foreach_descriptor - go through the active part of the log * @jd: the journal * @start: the first log header in the active region * @end: the last log header (don't process the contents of this entry)) * @pass: iteration number (foreach_descriptor() is called in a for() loop) * * Call a given function once for every log descriptor in the active * portion of the log. * * Returns: errno */ static int foreach_descriptor(struct gfs2_jdesc *jd, u32 start, unsigned int end, int pass) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct buffer_head *bh; struct gfs2_log_descriptor *ld; int error = 0; u32 length; __be64 *ptr; unsigned int offset = sizeof(struct gfs2_log_descriptor); offset += sizeof(__be64) - 1; offset &= ~(sizeof(__be64) - 1); while (start != end) { error = gfs2_replay_read_block(jd, start, &bh); if (error) return error; if (gfs2_meta_check(sdp, bh)) { brelse(bh); return -EIO; } ld = (struct gfs2_log_descriptor *)bh->b_data; length = be32_to_cpu(ld->ld_length); if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) { struct gfs2_log_header_host lh; error = get_log_header(jd, start, &lh); if (!error) { gfs2_replay_incr_blk(jd, &start); brelse(bh); continue; } if (error == 1) { gfs2_consist_inode(GFS2_I(jd->jd_inode)); error = -EIO; } brelse(bh); return error; } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) { brelse(bh); return -EIO; } ptr = (__be64 *)(bh->b_data + offset); error = lops_scan_elements(jd, start, ld, ptr, pass); if (error) { brelse(bh); return error; } while 
(length--) gfs2_replay_incr_blk(jd, &start); brelse(bh); } return 0; } /** * clean_journal - mark a dirty journal as being clean * @jd: the journal * @head: the head journal to start from * * Returns: errno */ static void clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); u32 lblock = head->lh_blkno; gfs2_replay_incr_blk(jd, &lblock); gfs2_write_log_header(sdp, jd, head->lh_sequence + 1, 0, lblock, GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY, REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC); if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { sdp->sd_log_flush_head = lblock; gfs2_log_incr_head(sdp); } } static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, unsigned int message) { char env_jid[20]; char env_status[20]; char *envp[] = { env_jid, env_status, NULL }; struct lm_lockstruct *ls = &sdp->sd_lockstruct; ls->ls_recover_jid_done = jid; ls->ls_recover_jid_status = message; sprintf(env_jid, "JID=%u", jid); sprintf(env_status, "RECOVERY=%s", message == LM_RD_SUCCESS ? "Done" : "Failed"); kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); if (sdp->sd_lockstruct.ls_ops->lm_recovery_result) sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); } /** * update_statfs_inode - Update the master statfs inode or zero out the local * statfs inode for a given journal. * @jd: The journal * @head: If NULL, @inode is the local statfs inode and we need to zero it out. * Otherwise, it @head contains the statfs change info that needs to be * synced to the master statfs inode (pointed to by @inode). * @inode: statfs inode to update. 
*/
static int update_statfs_inode(struct gfs2_jdesc *jd,
			       struct gfs2_log_header_host *head,
			       struct inode *inode)
{
	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
	struct gfs2_statfs_change_host sc;
	struct gfs2_inode *ip;
	struct buffer_head *bh;
	int error;

	BUG_ON(!inode);
	ip = GFS2_I(inode);

	error = gfs2_meta_inode_buffer(ip, &bh);
	if (error)
		return error;

	spin_lock(&sdp->sd_statfs_spin);

	if (!head) {
		/* Zero out the local statfs inode */
		memset(bh->b_data + sizeof(struct gfs2_dinode), 0,
		       sizeof(struct gfs2_statfs_change));

		/* If it's our own journal, reset any in-memory changes too */
		if (jd->jd_jid == sdp->sd_lockstruct.ls_jid)
			memset(&sdp->sd_statfs_local, 0,
			       sizeof(struct gfs2_statfs_change_host));
	} else {
		/* Update the master statfs inode */
		gfs2_statfs_change_in(&sc, bh->b_data +
					   sizeof(struct gfs2_dinode));
		sc.sc_total += head->lh_local_total;
		sc.sc_free += head->lh_local_free;
		sc.sc_dinodes += head->lh_local_dinodes;
		gfs2_statfs_change_out(&sc, bh->b_data +
					    sizeof(struct gfs2_dinode));

		fs_info(sdp, "jid=%u: Updated master statfs Total:%lld, "
			"Free:%lld, Dinodes:%lld after change "
			"[%+lld,%+lld,%+lld]\n", jd->jd_jid, sc.sc_total,
			sc.sc_free, sc.sc_dinodes, head->lh_local_total,
			head->lh_local_free, head->lh_local_dinodes);
	}

	spin_unlock(&sdp->sd_statfs_spin);

	/* The buffer was modified on either branch: write it back. */
	mark_buffer_dirty(bh);
	brelse(bh);
	gfs2_inode_metasync(ip->i_gl);

	return 0;
}

/**
 * recover_local_statfs - Update the master and local statfs changes for this
 *			  journal.
 *
 * Previously, statfs updates would be read in from the local statfs inode and
 * synced to the master statfs inode during recovery.
 *
 * We now use the statfs updates in the journal head to update the master statfs
 * inode instead of reading in from the local statfs inode. To preserve backward
 * compatibility with kernels that can't do this, we still need to keep the
 * local statfs inode up to date by writing changes to it.
At some point in the * future, we can do away with the local statfs inodes altogether and keep the * statfs changes solely in the journal. * * @jd: the journal * @head: the journal head * * Returns: errno */ static void recover_local_statfs(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) { int error; struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); if (!head->lh_local_total && !head->lh_local_free && !head->lh_local_dinodes) /* No change */ goto zero_local; /* First update the master statfs inode with the changes we * found in the journal. */ error = update_statfs_inode(jd, head, sdp->sd_statfs_inode); if (error) goto out; zero_local: /* Zero out the local statfs inode so any changes in there * are not re-recovered. */ error = update_statfs_inode(jd, NULL, find_local_statfs_inode(sdp, jd->jd_jid)); out: return; } void gfs2_recover_func(struct work_struct *work) { struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host head; struct gfs2_holder j_gh, ji_gh; ktime_t t_start, t_jlck, t_jhd, t_tlck, t_rep; int ro = 0; unsigned int pass; int error = 0; int jlocked = 0; if (gfs2_withdrawing_or_withdrawn(sdp)) { fs_err(sdp, "jid=%u: Recovery not attempted due to withdraw.\n", jd->jd_jid); goto fail; } t_start = ktime_get(); if (sdp->sd_args.ar_spectator) goto fail; if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { fs_info(sdp, "jid=%u: Trying to acquire journal glock...\n", jd->jd_jid); jlocked = 1; /* Acquire the journal glock so we can do recovery */ error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE, &j_gh); switch (error) { case 0: break; case GLR_TRYFAILED: fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid); error = 0; goto fail; default: goto fail; } error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh); if (error) goto fail_gunlock_j; 
} else { fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid); } t_jlck = ktime_get(); fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid); error = gfs2_jdesc_check(jd); if (error) goto fail_gunlock_ji; error = gfs2_find_jhead(jd, &head, true); if (error) goto fail_gunlock_ji; t_jhd = ktime_get(); fs_info(sdp, "jid=%u: Journal head lookup took %lldms\n", jd->jd_jid, ktime_ms_delta(t_jhd, t_jlck)); if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { mutex_lock(&sdp->sd_freeze_mutex); if (test_bit(SDF_FROZEN, &sdp->sd_flags)) { mutex_unlock(&sdp->sd_freeze_mutex); fs_warn(sdp, "jid=%u: Can't replay: filesystem " "is frozen\n", jd->jd_jid); goto fail_gunlock_ji; } if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) { ro = 1; } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) ro = 1; } else { if (sb_rdonly(sdp->sd_vfs)) { /* check if device itself is read-only */ ro = bdev_read_only(sdp->sd_vfs->s_bdev); if (!ro) { fs_info(sdp, "recovery required on " "read-only filesystem.\n"); fs_info(sdp, "write access will be " "enabled during recovery.\n"); } } } if (ro) { fs_warn(sdp, "jid=%u: Can't replay: read-only block " "device\n", jd->jd_jid); error = -EROFS; goto fail_gunlock_nofreeze; } t_tlck = ktime_get(); fs_info(sdp, "jid=%u: Replaying journal...0x%x to 0x%x\n", jd->jd_jid, head.lh_tail, head.lh_blkno); /* We take the sd_log_flush_lock here primarily to prevent log * flushes and simultaneous journal replays from stomping on * each other wrt jd_log_bio. 
*/ down_read(&sdp->sd_log_flush_lock); for (pass = 0; pass < 2; pass++) { lops_before_scan(jd, &head, pass); error = foreach_descriptor(jd, head.lh_tail, head.lh_blkno, pass); lops_after_scan(jd, error, pass); if (error) { up_read(&sdp->sd_log_flush_lock); goto fail_gunlock_nofreeze; } } recover_local_statfs(jd, &head); clean_journal(jd, &head); up_read(&sdp->sd_log_flush_lock); mutex_unlock(&sdp->sd_freeze_mutex); t_rep = ktime_get(); fs_info(sdp, "jid=%u: Journal replayed in %lldms [jlck:%lldms, " "jhead:%lldms, tlck:%lldms, replay:%lldms]\n", jd->jd_jid, ktime_ms_delta(t_rep, t_start), ktime_ms_delta(t_jlck, t_start), ktime_ms_delta(t_jhd, t_jlck), ktime_ms_delta(t_tlck, t_jhd), ktime_ms_delta(t_rep, t_tlck)); } gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); if (jlocked) { gfs2_glock_dq_uninit(&ji_gh); gfs2_glock_dq_uninit(&j_gh); } fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); goto done; fail_gunlock_nofreeze: mutex_unlock(&sdp->sd_freeze_mutex); fail_gunlock_ji: if (jlocked) { gfs2_glock_dq_uninit(&ji_gh); fail_gunlock_j: gfs2_glock_dq_uninit(&j_gh); } fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); fail: jd->jd_recover_error = error; gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); done: clear_bit(JDF_RECOVERY, &jd->jd_flags); smp_mb__after_atomic(); wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) { int rv; if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) return -EBUSY; /* we have JDF_RECOVERY, queue should always succeed */ rv = queue_work(gfs2_recovery_wq, &jd->jd_work); BUG_ON(!rv); if (wait) wait_on_bit(&jd->jd_flags, JDF_RECOVERY, TASK_UNINTERRUPTIBLE); return wait ? jd->jd_recover_error : 0; } |
| 1 1 1 1 2 2 1 2 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 
520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 
1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 | /* * Created: Fri Jan 19 10:48:35 2001 by faith@acm.org * * Copyright 2001 VA Linux Systems, Inc., Sunnyvale, California. * All Rights Reserved. * * Author Rickard E. (Rik) Faith <faith@valinux.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
*/ #include <linux/debugfs.h> #include <linux/fs.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/slab.h> #include <linux/srcu.h> #include <drm/drm_accel.h> #include <drm/drm_cache.h> #include <drm/drm_client.h> #include <drm/drm_color_mgmt.h> #include <drm/drm_drv.h> #include <drm/drm_file.h> #include <drm/drm_managed.h> #include <drm/drm_mode_object.h> #include <drm/drm_panic.h> #include <drm/drm_print.h> #include <drm/drm_privacy_screen_machine.h> #include "drm_crtc_internal.h" #include "drm_internal.h" MODULE_AUTHOR("Gareth Hughes, Leif Delgass, José Fonseca, Jon Smirl"); MODULE_DESCRIPTION("DRM shared core routines"); MODULE_LICENSE("GPL and additional rights"); static DEFINE_SPINLOCK(drm_minor_lock); static struct idr drm_minors_idr; /* * If the drm core fails to init for whatever reason, * we should prevent any drivers from registering with it. * It's best to check this at drm_dev_init(), as some drivers * prefer to embed struct drm_device into their own device * structure and call drm_dev_init() themselves. */ static bool drm_core_init_complete; static struct dentry *drm_debugfs_root; DEFINE_STATIC_SRCU(drm_unplug_srcu); /* * DRM Minors * A DRM device can provide several char-dev interfaces on the DRM-Major. Each * of them is represented by a drm_minor object. Depending on the capabilities * of the device-driver, different interfaces are registered. * * Minors can be accessed via dev->$minor_name. This pointer is either * NULL or a valid drm_minor pointer and stays valid as long as the device is * valid. This means, DRM minors have the same life-time as the underlying * device. However, this doesn't mean that the minor is active. Minors are * registered and unregistered dynamically according to device-state. 
*/ static struct drm_minor **drm_minor_get_slot(struct drm_device *dev, enum drm_minor_type type) { switch (type) { case DRM_MINOR_PRIMARY: return &dev->primary; case DRM_MINOR_RENDER: return &dev->render; case DRM_MINOR_ACCEL: return &dev->accel; default: BUG(); } } static void drm_minor_alloc_release(struct drm_device *dev, void *data) { struct drm_minor *minor = data; unsigned long flags; WARN_ON(dev != minor->dev); put_device(minor->kdev); if (minor->type == DRM_MINOR_ACCEL) { accel_minor_remove(minor->index); } else { spin_lock_irqsave(&drm_minor_lock, flags); idr_remove(&drm_minors_idr, minor->index); spin_unlock_irqrestore(&drm_minor_lock, flags); } } static int drm_minor_alloc(struct drm_device *dev, enum drm_minor_type type) { struct drm_minor *minor; unsigned long flags; int r; minor = drmm_kzalloc(dev, sizeof(*minor), GFP_KERNEL); if (!minor) return -ENOMEM; minor->type = type; minor->dev = dev; idr_preload(GFP_KERNEL); if (type == DRM_MINOR_ACCEL) { r = accel_minor_alloc(); } else { spin_lock_irqsave(&drm_minor_lock, flags); r = idr_alloc(&drm_minors_idr, NULL, 64 * type, 64 * (type + 1), GFP_NOWAIT); spin_unlock_irqrestore(&drm_minor_lock, flags); } idr_preload_end(); if (r < 0) return r; minor->index = r; r = drmm_add_action_or_reset(dev, drm_minor_alloc_release, minor); if (r) return r; minor->kdev = drm_sysfs_minor_alloc(minor); if (IS_ERR(minor->kdev)) return PTR_ERR(minor->kdev); *drm_minor_get_slot(dev, type) = minor; return 0; } static int drm_minor_register(struct drm_device *dev, enum drm_minor_type type) { struct drm_minor *minor; unsigned long flags; int ret; DRM_DEBUG("\n"); minor = *drm_minor_get_slot(dev, type); if (!minor) return 0; if (minor->type != DRM_MINOR_ACCEL) { ret = drm_debugfs_register(minor, minor->index, drm_debugfs_root); if (ret) { DRM_ERROR("DRM: Failed to initialize /sys/kernel/debug/dri.\n"); goto err_debugfs; } } ret = device_add(minor->kdev); if (ret) goto err_debugfs; /* replace NULL with @minor so lookups will 
succeed from now on */ if (minor->type == DRM_MINOR_ACCEL) { accel_minor_replace(minor, minor->index); } else { spin_lock_irqsave(&drm_minor_lock, flags); idr_replace(&drm_minors_idr, minor, minor->index); spin_unlock_irqrestore(&drm_minor_lock, flags); } DRM_DEBUG("new minor registered %d\n", minor->index); return 0; err_debugfs: drm_debugfs_unregister(minor); return ret; } static void drm_minor_unregister(struct drm_device *dev, enum drm_minor_type type) { struct drm_minor *minor; unsigned long flags; minor = *drm_minor_get_slot(dev, type); if (!minor || !device_is_registered(minor->kdev)) return; /* replace @minor with NULL so lookups will fail from now on */ if (minor->type == DRM_MINOR_ACCEL) { accel_minor_replace(NULL, minor->index); } else { spin_lock_irqsave(&drm_minor_lock, flags); idr_replace(&drm_minors_idr, NULL, minor->index); spin_unlock_irqrestore(&drm_minor_lock, flags); } device_del(minor->kdev); dev_set_drvdata(minor->kdev, NULL); /* safety belt */ drm_debugfs_unregister(minor); } /* * Looks up the given minor-ID and returns the respective DRM-minor object. The * refence-count of the underlying device is increased so you must release this * object with drm_minor_release(). * * As long as you hold this minor, it is guaranteed that the object and the * minor->dev pointer will stay valid! However, the device may get unplugged and * unregistered while you hold the minor. 
*/ struct drm_minor *drm_minor_acquire(unsigned int minor_id) { struct drm_minor *minor; unsigned long flags; spin_lock_irqsave(&drm_minor_lock, flags); minor = idr_find(&drm_minors_idr, minor_id); if (minor) drm_dev_get(minor->dev); spin_unlock_irqrestore(&drm_minor_lock, flags); if (!minor) { return ERR_PTR(-ENODEV); } else if (drm_dev_is_unplugged(minor->dev)) { drm_dev_put(minor->dev); return ERR_PTR(-ENODEV); } return minor; } void drm_minor_release(struct drm_minor *minor) { drm_dev_put(minor->dev); } /** * DOC: driver instance overview * * A device instance for a drm driver is represented by &struct drm_device. This * is allocated and initialized with devm_drm_dev_alloc(), usually from * bus-specific ->probe() callbacks implemented by the driver. The driver then * needs to initialize all the various subsystems for the drm device like memory * management, vblank handling, modesetting support and initial output * configuration plus obviously initialize all the corresponding hardware bits. * Finally when everything is up and running and ready for userspace the device * instance can be published using drm_dev_register(). * * There is also deprecated support for initializing device instances using * bus-specific helpers and the &drm_driver.load callback. But due to * backwards-compatibility needs the device instance have to be published too * early, which requires unpretty global locking to make safe and is therefore * only support for existing drivers not yet converted to the new scheme. * * When cleaning up a device instance everything needs to be done in reverse: * First unpublish the device instance with drm_dev_unregister(). Then clean up * any other resources allocated at device initialization and drop the driver's * reference to &drm_device using drm_dev_put(). 
*
 * Note that any allocation or resource which is visible to userspace must be
 * released only when the final drm_dev_put() is called, and not when the
 * driver is unbound from the underlying physical struct &device. Best to use
 * &drm_device managed resources with drmm_add_action(), drmm_kmalloc() and
 * related functions.
 *
 * devres managed resources like devm_kmalloc() can only be used for resources
 * directly related to the underlying hardware device, and only used in code
 * paths fully protected by drm_dev_enter() and drm_dev_exit().
 *
 * Display driver example
 * ~~~~~~~~~~~~~~~~~~~~~~
 *
 * The following example shows a typical structure of a DRM display driver.
 * The example focuses on the probe() function and the other functions that are
 * almost always present, and serves as a demonstration of devm_drm_dev_alloc().
 *
 * .. code-block:: c
 *
 *	struct driver_device {
 *		struct drm_device drm;
 *		void *userspace_facing;
 *		struct clk *pclk;
 *	};
 *
 *	static const struct drm_driver driver_drm_driver = {
 *		[...]
 *	};
 *
 *	static int driver_probe(struct platform_device *pdev)
 *	{
 *		struct driver_device *priv;
 *		struct drm_device *drm;
 *		int ret;
 *
 *		priv = devm_drm_dev_alloc(&pdev->dev, &driver_drm_driver,
 *					  struct driver_device, drm);
 *		if (IS_ERR(priv))
 *			return PTR_ERR(priv);
 *		drm = &priv->drm;
 *
 *		ret = drmm_mode_config_init(drm);
 *		if (ret)
 *			return ret;
 *
 *		priv->userspace_facing = drmm_kzalloc(..., GFP_KERNEL);
 *		if (!priv->userspace_facing)
 *			return -ENOMEM;
 *
 *		priv->pclk = devm_clk_get(&pdev->dev, "PCLK");
 *		if (IS_ERR(priv->pclk))
 *			return PTR_ERR(priv->pclk);
 *
 *		// Further setup, display pipeline etc
 *
 *		platform_set_drvdata(pdev, drm);
 *
 *		drm_mode_config_reset(drm);
 *
 *		ret = drm_dev_register(drm);
 *		if (ret)
 *			return ret;
 *
 *		drm_fbdev_{...}_setup(drm, 32);
 *
 *		return 0;
 *	}
 *
 *	// This function is called before the devm_ resources are released
 *	static int driver_remove(struct platform_device *pdev)
 *	{
 *		struct drm_device *drm = platform_get_drvdata(pdev);
 *
 *		drm_dev_unregister(drm);
 *		drm_atomic_helper_shutdown(drm);
 *
 *		return 0;
 *	}
 *
 *	// This function is called on kernel restart and shutdown
 *	static void driver_shutdown(struct platform_device *pdev)
 *	{
 *		drm_atomic_helper_shutdown(platform_get_drvdata(pdev));
 *	}
 *
 *	static int __maybe_unused driver_pm_suspend(struct device *dev)
 *	{
 *		return drm_mode_config_helper_suspend(dev_get_drvdata(dev));
 *	}
 *
 *	static int __maybe_unused driver_pm_resume(struct device *dev)
 *	{
 *		drm_mode_config_helper_resume(dev_get_drvdata(dev));
 *
 *		return 0;
 *	}
 *
 *	static const struct dev_pm_ops driver_pm_ops = {
 *		SET_SYSTEM_SLEEP_PM_OPS(driver_pm_suspend, driver_pm_resume)
 *	};
 *
 *	static struct platform_driver driver_driver = {
 *		.driver = {
 *			[...]
* .pm = &driver_pm_ops, * }, * .probe = driver_probe, * .remove = driver_remove, * .shutdown = driver_shutdown, * }; * module_platform_driver(driver_driver); * * Drivers that want to support device unplugging (USB, DT overlay unload) should * use drm_dev_unplug() instead of drm_dev_unregister(). The driver must protect * regions that is accessing device resources to prevent use after they're * released. This is done using drm_dev_enter() and drm_dev_exit(). There is one * shortcoming however, drm_dev_unplug() marks the drm_device as unplugged before * drm_atomic_helper_shutdown() is called. This means that if the disable code * paths are protected, they will not run on regular driver module unload, * possibly leaving the hardware enabled. */ /** * drm_put_dev - Unregister and release a DRM device * @dev: DRM device * * Called at module unload time or when a PCI device is unplugged. * * Cleans up all DRM device, calling drm_lastclose(). * * Note: Use of this function is deprecated. It will eventually go away * completely. Please use drm_dev_unregister() and drm_dev_put() explicitly * instead to make sure that the device isn't userspace accessible any more * while teardown is in progress, ensuring that userspace can't access an * inconsistent state. */ void drm_put_dev(struct drm_device *dev) { DRM_DEBUG("\n"); if (!dev) { DRM_ERROR("cleanup called no dev\n"); return; } drm_dev_unregister(dev); drm_dev_put(dev); } EXPORT_SYMBOL(drm_put_dev); /** * drm_dev_enter - Enter device critical section * @dev: DRM device * @idx: Pointer to index that will be passed to the matching drm_dev_exit() * * This function marks and protects the beginning of a section that should not * be entered after the device has been unplugged. The section end is marked * with drm_dev_exit(). Calls to this function can be nested. * * Returns: * True if it is OK to enter the section, false otherwise. 
*/ bool drm_dev_enter(struct drm_device *dev, int *idx) { *idx = srcu_read_lock(&drm_unplug_srcu); if (dev->unplugged) { srcu_read_unlock(&drm_unplug_srcu, *idx); return false; } return true; } EXPORT_SYMBOL(drm_dev_enter); /** * drm_dev_exit - Exit device critical section * @idx: index returned from drm_dev_enter() * * This function marks the end of a section that should not be entered after * the device has been unplugged. */ void drm_dev_exit(int idx) { srcu_read_unlock(&drm_unplug_srcu, idx); } EXPORT_SYMBOL(drm_dev_exit); /** * drm_dev_unplug - unplug a DRM device * @dev: DRM device * * This unplugs a hotpluggable DRM device, which makes it inaccessible to * userspace operations. Entry-points can use drm_dev_enter() and * drm_dev_exit() to protect device resources in a race free manner. This * essentially unregisters the device like drm_dev_unregister(), but can be * called while there are still open users of @dev. */ void drm_dev_unplug(struct drm_device *dev) { /* * After synchronizing any critical read section is guaranteed to see * the new value of ->unplugged, and any critical section which might * still have seen the old value of ->unplugged is guaranteed to have * finished. */ dev->unplugged = true; synchronize_srcu(&drm_unplug_srcu); drm_dev_unregister(dev); /* Clear all CPU mappings pointing to this device */ unmap_mapping_range(dev->anon_inode->i_mapping, 0, 0, 1); } EXPORT_SYMBOL(drm_dev_unplug); /* * DRM internal mount * We want to be able to allocate our own "struct address_space" to control * memory-mappings in VRAM (or stolen RAM, ...). However, core MM does not allow * stand-alone address_space objects, so we need an underlying inode. As there * is no way to allocate an independent inode easily, we need a fake internal * VFS mount-point. * * The drm_fs_inode_new() function allocates a new inode, drm_fs_inode_free() * frees it again. You are allowed to use iget() and iput() to get references to * the inode. 
But each drm_fs_inode_new() call must be paired with exactly one * drm_fs_inode_free() call (which does not have to be the last iput()). * We use drm_fs_inode_*() to manage our internal VFS mount-point and share it * between multiple inode-users. You could, technically, call * iget() + drm_fs_inode_free() directly after alloc and sometime later do an * iput(), but this way you'd end up with a new vfsmount for each inode. */ static int drm_fs_cnt; static struct vfsmount *drm_fs_mnt; static int drm_fs_init_fs_context(struct fs_context *fc) { return init_pseudo(fc, 0x010203ff) ? 0 : -ENOMEM; } static struct file_system_type drm_fs_type = { .name = "drm", .owner = THIS_MODULE, .init_fs_context = drm_fs_init_fs_context, .kill_sb = kill_anon_super, }; static struct inode *drm_fs_inode_new(void) { struct inode *inode; int r; r = simple_pin_fs(&drm_fs_type, &drm_fs_mnt, &drm_fs_cnt); if (r < 0) { DRM_ERROR("Cannot mount pseudo fs: %d\n", r); return ERR_PTR(r); } inode = alloc_anon_inode(drm_fs_mnt->mnt_sb); if (IS_ERR(inode)) simple_release_fs(&drm_fs_mnt, &drm_fs_cnt); return inode; } static void drm_fs_inode_free(struct inode *inode) { if (inode) { iput(inode); simple_release_fs(&drm_fs_mnt, &drm_fs_cnt); } } /** * DOC: component helper usage recommendations * * DRM drivers that drive hardware where a logical device consists of a pile of * independent hardware blocks are recommended to use the :ref:`component helper * library<component>`. For consistency and better options for code reuse the * following guidelines apply: * * - The entire device initialization procedure should be run from the * &component_master_ops.master_bind callback, starting with * devm_drm_dev_alloc(), then binding all components with * component_bind_all() and finishing with drm_dev_register(). * * - The opaque pointer passed to all components through component_bind_all() * should point at &struct drm_device of the device instance, not some driver * specific private structure. 
* * - The component helper fills the niche where further standardization of * interfaces is not practical. When there already is, or will be, a * standardized interface like &drm_bridge or &drm_panel, providing its own * functions to find such components at driver load time, like * drm_of_find_panel_or_bridge(), then the component helper should not be * used. */ static void drm_dev_init_release(struct drm_device *dev, void *res) { drm_fs_inode_free(dev->anon_inode); put_device(dev->dev); /* Prevent use-after-free in drm_managed_release when debugging is * enabled. Slightly awkward, but can't really be helped. */ dev->dev = NULL; mutex_destroy(&dev->master_mutex); mutex_destroy(&dev->clientlist_mutex); mutex_destroy(&dev->filelist_mutex); mutex_destroy(&dev->struct_mutex); } static int drm_dev_init(struct drm_device *dev, const struct drm_driver *driver, struct device *parent) { struct inode *inode; int ret; if (!drm_core_init_complete) { DRM_ERROR("DRM core is not initialized\n"); return -ENODEV; } if (WARN_ON(!parent)) return -EINVAL; kref_init(&dev->ref); dev->dev = get_device(parent); dev->driver = driver; INIT_LIST_HEAD(&dev->managed.resources); spin_lock_init(&dev->managed.lock); /* no per-device feature limits by default */ dev->driver_features = ~0u; if (drm_core_check_feature(dev, DRIVER_COMPUTE_ACCEL) && (drm_core_check_feature(dev, DRIVER_RENDER) || drm_core_check_feature(dev, DRIVER_MODESET))) { DRM_ERROR("DRM driver can't be both a compute acceleration and graphics driver\n"); return -EINVAL; } INIT_LIST_HEAD(&dev->filelist); INIT_LIST_HEAD(&dev->filelist_internal); INIT_LIST_HEAD(&dev->clientlist); INIT_LIST_HEAD(&dev->vblank_event_list); spin_lock_init(&dev->event_lock); mutex_init(&dev->struct_mutex); mutex_init(&dev->filelist_mutex); mutex_init(&dev->clientlist_mutex); mutex_init(&dev->master_mutex); raw_spin_lock_init(&dev->mode_config.panic_lock); ret = drmm_add_action_or_reset(dev, drm_dev_init_release, NULL); if (ret) return ret; inode = 
drm_fs_inode_new(); if (IS_ERR(inode)) { ret = PTR_ERR(inode); DRM_ERROR("Cannot allocate anonymous inode: %d\n", ret); goto err; } dev->anon_inode = inode; if (drm_core_check_feature(dev, DRIVER_COMPUTE_ACCEL)) { ret = drm_minor_alloc(dev, DRM_MINOR_ACCEL); if (ret) goto err; } else { if (drm_core_check_feature(dev, DRIVER_RENDER)) { ret = drm_minor_alloc(dev, DRM_MINOR_RENDER); if (ret) goto err; } ret = drm_minor_alloc(dev, DRM_MINOR_PRIMARY); if (ret) goto err; } if (drm_core_check_feature(dev, DRIVER_GEM)) { ret = drm_gem_init(dev); if (ret) { DRM_ERROR("Cannot initialize graphics execution manager (GEM)\n"); goto err; } } dev->unique = drmm_kstrdup(dev, dev_name(parent), GFP_KERNEL); if (!dev->unique) { ret = -ENOMEM; goto err; } if (drm_core_check_feature(dev, DRIVER_COMPUTE_ACCEL)) accel_debugfs_init(dev); else drm_debugfs_dev_init(dev, drm_debugfs_root); return 0; err: drm_managed_release(dev); return ret; } static void devm_drm_dev_init_release(void *data) { drm_dev_put(data); } static int devm_drm_dev_init(struct device *parent, struct drm_device *dev, const struct drm_driver *driver) { int ret; ret = drm_dev_init(dev, driver, parent); if (ret) return ret; return devm_add_action_or_reset(parent, devm_drm_dev_init_release, dev); } void *__devm_drm_dev_alloc(struct device *parent, const struct drm_driver *driver, size_t size, size_t offset) { void *container; struct drm_device *drm; int ret; container = kzalloc(size, GFP_KERNEL); if (!container) return ERR_PTR(-ENOMEM); drm = container + offset; ret = devm_drm_dev_init(parent, drm, driver); if (ret) { kfree(container); return ERR_PTR(ret); } drmm_add_final_kfree(drm, container); return container; } EXPORT_SYMBOL(__devm_drm_dev_alloc); /** * drm_dev_alloc - Allocate new DRM device * @driver: DRM driver to allocate device for * @parent: Parent device object * * This is the deprecated version of devm_drm_dev_alloc(), which does not support * subclassing through embedding the struct &drm_device in a driver 
private * structure, and which does not support automatic cleanup through devres. * * RETURNS: * Pointer to new DRM device, or ERR_PTR on failure. */ struct drm_device *drm_dev_alloc(const struct drm_driver *driver, struct device *parent) { struct drm_device *dev; int ret; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); ret = drm_dev_init(dev, driver, parent); if (ret) { kfree(dev); return ERR_PTR(ret); } drmm_add_final_kfree(dev, dev); return dev; } EXPORT_SYMBOL(drm_dev_alloc); static void drm_dev_release(struct kref *ref) { struct drm_device *dev = container_of(ref, struct drm_device, ref); /* Just in case register/unregister was never called */ drm_debugfs_dev_fini(dev); if (dev->driver->release) dev->driver->release(dev); drm_managed_release(dev); kfree(dev->managed.final_kfree); } /** * drm_dev_get - Take reference of a DRM device * @dev: device to take reference of or NULL * * This increases the ref-count of @dev by one. You *must* already own a * reference when calling this. Use drm_dev_put() to drop this reference * again. * * This function never fails. However, this function does not provide *any* * guarantee whether the device is alive or running. It only provides a * reference to the object and the memory associated with it. */ void drm_dev_get(struct drm_device *dev) { if (dev) kref_get(&dev->ref); } EXPORT_SYMBOL(drm_dev_get); /** * drm_dev_put - Drop reference of a DRM device * @dev: device to drop reference of or NULL * * This decreases the ref-count of @dev by one. The device is destroyed if the * ref-count drops to zero. 
*/ void drm_dev_put(struct drm_device *dev) { if (dev) kref_put(&dev->ref, drm_dev_release); } EXPORT_SYMBOL(drm_dev_put); static int create_compat_control_link(struct drm_device *dev) { struct drm_minor *minor; char *name; int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return 0; minor = *drm_minor_get_slot(dev, DRM_MINOR_PRIMARY); if (!minor) return 0; /* * Some existing userspace out there uses the existing of the controlD* * sysfs files to figure out whether it's a modeset driver. It only does * readdir, hence a symlink is sufficient (and the least confusing * option). Otherwise controlD* is entirely unused. * * Old controlD chardev have been allocated in the range * 64-127. */ name = kasprintf(GFP_KERNEL, "controlD%d", minor->index + 64); if (!name) return -ENOMEM; ret = sysfs_create_link(minor->kdev->kobj.parent, &minor->kdev->kobj, name); kfree(name); return ret; } static void remove_compat_control_link(struct drm_device *dev) { struct drm_minor *minor; char *name; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return; minor = *drm_minor_get_slot(dev, DRM_MINOR_PRIMARY); if (!minor) return; name = kasprintf(GFP_KERNEL, "controlD%d", minor->index + 64); if (!name) return; sysfs_remove_link(minor->kdev->kobj.parent, name); kfree(name); } /** * drm_dev_register - Register DRM device * @dev: Device to register * @flags: Flags passed to the driver's .load() function * * Register the DRM device @dev with the system, advertise device to user-space * and start normal device operation. @dev must be initialized via drm_dev_init() * previously. * * Never call this twice on any device! * * NOTE: To ensure backward compatibility with existing drivers method this * function calls the &drm_driver.load method after registering the device * nodes, creating race conditions. Usage of the &drm_driver.load methods is * therefore deprecated, drivers must perform all initialization before calling * drm_dev_register(). 
*
 * RETURNS:
 * 0 on success, negative error code on failure.
 */
int drm_dev_register(struct drm_device *dev, unsigned long flags)
{
	const struct drm_driver *driver = dev->driver;
	int ret;

	if (!driver->load)
		drm_mode_config_validate(dev);

	WARN_ON(!dev->managed.final_kfree);

	if (drm_dev_needs_global_mutex(dev))
		mutex_lock(&drm_global_mutex);

	if (drm_core_check_feature(dev, DRIVER_COMPUTE_ACCEL))
		accel_debugfs_register(dev);
	else
		drm_debugfs_dev_register(dev);

	/* Minor types the device does not have are skipped by the callee. */
	ret = drm_minor_register(dev, DRM_MINOR_RENDER);
	if (ret)
		goto err_minors;

	ret = drm_minor_register(dev, DRM_MINOR_PRIMARY);
	if (ret)
		goto err_minors;

	ret = drm_minor_register(dev, DRM_MINOR_ACCEL);
	if (ret)
		goto err_minors;

	ret = create_compat_control_link(dev);
	if (ret)
		goto err_minors;

	dev->registered = true;

	if (driver->load) {
		ret = driver->load(dev, flags);
		if (ret)
			goto err_minors;
	}

	if (drm_core_check_feature(dev, DRIVER_MODESET)) {
		ret = drm_modeset_register_all(dev);
		if (ret)
			goto err_unload;
	}
	drm_panic_register(dev);

	DRM_INFO("Initialized %s %d.%d.%d for %s on minor %d\n",
		 driver->name, driver->major, driver->minor,
		 driver->patchlevel,
		 dev->dev ? dev_name(dev->dev) : "virtual device",
		 dev->primary ? dev->primary->index : dev->accel->index);

	goto out_unlock;

err_unload:
	if (dev->driver->unload)
		dev->driver->unload(dev);
err_minors:
	/*
	 * The minors are taken down again below, so the device must not keep
	 * claiming to be registered after a failed ->load() or modeset
	 * registration.
	 */
	dev->registered = false;
	remove_compat_control_link(dev);
	drm_minor_unregister(dev, DRM_MINOR_ACCEL);
	drm_minor_unregister(dev, DRM_MINOR_PRIMARY);
	drm_minor_unregister(dev, DRM_MINOR_RENDER);
out_unlock:
	if (drm_dev_needs_global_mutex(dev))
		mutex_unlock(&drm_global_mutex);
	return ret;
}
EXPORT_SYMBOL(drm_dev_register);

/**
 * drm_dev_unregister - Unregister DRM device
 * @dev: Device to unregister
 *
 * Unregister the DRM device from the system. This does the reverse of
 * drm_dev_register() but does not deallocate the device.
The caller must call * drm_dev_put() to drop their final reference, unless it is managed with devres * (as devices allocated with devm_drm_dev_alloc() are), in which case there is * already an unwind action registered. * * A special form of unregistering for hotpluggable devices is drm_dev_unplug(), * which can be called while there are still open users of @dev. * * This should be called first in the device teardown code to make sure * userspace can't access the device instance any more. */ void drm_dev_unregister(struct drm_device *dev) { dev->registered = false; drm_panic_unregister(dev); drm_client_dev_unregister(dev); if (drm_core_check_feature(dev, DRIVER_MODESET)) drm_modeset_unregister_all(dev); if (dev->driver->unload) dev->driver->unload(dev); remove_compat_control_link(dev); drm_minor_unregister(dev, DRM_MINOR_ACCEL); drm_minor_unregister(dev, DRM_MINOR_PRIMARY); drm_minor_unregister(dev, DRM_MINOR_RENDER); drm_debugfs_dev_fini(dev); } EXPORT_SYMBOL(drm_dev_unregister); /* * DRM Core * The DRM core module initializes all global DRM objects and makes them * available to drivers. Once setup, drivers can probe their respective * devices. * Currently, core management includes: * - The "DRM-Global" key/value database * - Global ID management for connectors * - DRM major number allocation * - DRM minor management * - DRM sysfs class * - DRM debugfs root * * Furthermore, the DRM core provides dynamic char-dev lookups. For each * interface registered on a DRM device, you can request minor numbers from DRM * core. DRM core takes care of major-number management and char-dev * registration. A stub ->open() callback forwards any open() requests to the * registered minor. 
*/ static int drm_stub_open(struct inode *inode, struct file *filp) { const struct file_operations *new_fops; struct drm_minor *minor; int err; DRM_DEBUG("\n"); minor = drm_minor_acquire(iminor(inode)); if (IS_ERR(minor)) return PTR_ERR(minor); new_fops = fops_get(minor->dev->driver->fops); if (!new_fops) { err = -ENODEV; goto out; } replace_fops(filp, new_fops); if (filp->f_op->open) err = filp->f_op->open(inode, filp); else err = 0; out: drm_minor_release(minor); return err; } static const struct file_operations drm_stub_fops = { .owner = THIS_MODULE, .open = drm_stub_open, .llseek = noop_llseek, }; static void drm_core_exit(void) { drm_privacy_screen_lookup_exit(); accel_core_exit(); unregister_chrdev(DRM_MAJOR, "drm"); debugfs_remove(drm_debugfs_root); drm_sysfs_destroy(); idr_destroy(&drm_minors_idr); drm_connector_ida_destroy(); } static int __init drm_core_init(void) { int ret; drm_connector_ida_init(); idr_init(&drm_minors_idr); drm_memcpy_init_early(); ret = drm_sysfs_init(); if (ret < 0) { DRM_ERROR("Cannot create DRM class: %d\n", ret); goto error; } drm_debugfs_root = debugfs_create_dir("dri", NULL); ret = register_chrdev(DRM_MAJOR, "drm", &drm_stub_fops); if (ret < 0) goto error; ret = accel_core_init(); if (ret < 0) goto error; drm_privacy_screen_lookup_init(); drm_core_init_complete = true; DRM_DEBUG("Initialized\n"); return 0; error: drm_core_exit(); return ret; } module_init(drm_core_init); module_exit(drm_core_exit); |
| 2 59 59 61 120 119 74 120 1 88 1 1 38 38 89 88 88 1 63 2 9 18 55 88 84 50 38 6 15 20 54 47 42 33 42 49 45 4 3 52 7 4 43 41 45 52 51 54 54 5 37 12 51 2 43 44 28 42 43 27 32 28 15 32 12 43 2 46 13 8 5 13 58 1 56 57 56 55 56 55 52 5 27 17 41 1 50 39 17 56 53 1 4 59 58 58 75 73 70 14 3 84 81 32 5 40 51 15 36 50 85 2 5 1 81 3 87 3 71 70 75 3 2 1 1 1 1 46 1 43 40 1 1 47 24 1 20 2 41 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 
432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 
932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 
1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 | // SPDX-License-Identifier: GPL-2.0 /* * This file contains the procedures for the handling of select and poll * * Created for Linux based loosely upon Mathius Lattner's minix * patches by Peter MacDonald. Heavily edited by Linus. * * 4 February 1994 * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS * flag set in its personality we do *not* modify the given timeout * parameter to reflect time remaining. * * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ #include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/personality.h> /* for STICKY_TIMEOUTS */ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/freezer.h> #include <net/busy_poll.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> /* * Estimate expected accuracy in ns from a timeval. * * After quite a bit of churning around, we've settled on * a simple thing of taking 0.1% of the timeout as the * slack, with a cap of 100 msec. * "nice" tasks get a 0.5% slack instead. * * Consider this comment an open invitation to come up with even * better solutions.. 
*/ #define MAX_SLACK (100 * NSEC_PER_MSEC) static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; if (tv->tv_sec < 0) return 0; if (task_nice(current) > 0) divfactor = divfactor / 5; if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) return MAX_SLACK; slack = tv->tv_nsec / divfactor; slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); if (slack > MAX_SLACK) return MAX_SLACK; return slack; } u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; struct timespec64 now; /* * Realtime tasks get a slack of 0 for obvious reasons. */ if (rt_task(current)) return 0; ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < current->timer_slack_ns) return current->timer_slack_ns; return ret; } struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; struct poll_table_entry entries[]; }; #define POLL_TABLE_FULL(table) \ ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. * I have rewritten this, taking some shortcuts: This code may not be easy to * follow, but it should be free of race-conditions, and it's practical. If you * understand what I'm doing here, then you understand how the linux * sleep/wakeup mechanism works. * * Two very simple procedures, poll_wait() and poll_freewait() make all the * work. poll_wait() is an inline-function defined in <linux/poll.h>, * as all select/poll functions have to call it to add an entry to the * poll table. 
*/ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p); void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); pwq->polling_task = current; pwq->triggered = 0; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; } EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) { remove_wait_queue(entry->wait_address, &entry->wait); fput(entry->filp); } void poll_freewait(struct poll_wqueues *pwq) { struct poll_table_page * p = pwq->table; int i; for (i = 0; i < pwq->inline_index; i++) free_poll_entry(pwq->inline_entries + i); while (p) { struct poll_table_entry * entry; struct poll_table_page *old; entry = p->entry; do { entry--; free_poll_entry(entry); } while (entry > p->entries); old = p; p = p->next; free_page((unsigned long) old); } } EXPORT_SYMBOL(poll_freewait); static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { struct poll_table_page *table = p->table; if (p->inline_index < N_INLINE_POLL_ENTRIES) return p->inline_entries + p->inline_index++; if (!table || POLL_TABLE_FULL(table)) { struct poll_table_page *new_table; new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); if (!new_table) { p->error = -ENOMEM; return NULL; } new_table->entry = new_table->entries; new_table->next = table; p->table = new_table; table = new_table; } return table->entry++; } static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); /* * Although this function is called under waitqueue lock, LOCK * doesn't imply write barrier and the users expect write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() * and is paired with smp_store_mb() in poll_schedule_timeout. */ smp_wmb(); pwq->triggered = 1; /* * Perform the default wake up operation using a dummy * waitqueue. 
* * TODO: This is hacky but there currently is no interface to * pass in @sync. @sync is scheduled to be removed and once * that happens, wake_up_process() can be used directly. */ return default_wake_function(&dummy_wait, mode, sync, key); } static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; entry = container_of(wait, struct poll_table_entry, wait); if (key && !(key_to_poll(key) & entry->key)) return 0; return __pollwake(wait, mode, sync, key); } /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); struct poll_table_entry *entry = poll_get_entry(pwq); if (!entry) return; entry->filp = get_file(filp); entry->wait_address = wait_address; entry->key = p->_key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); } static int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack) { int rc = -EINTR; set_current_state(state); if (!pwq->triggered) rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * Prepare for the next iteration. * * The following smp_store_mb() serves two purposes. First, it's * the counterpart rmb of the wmb in pollwake() such that data * written before wake up is always visible after wake up. * Second, the full barrier guarantees that triggered clearing * doesn't pass event check of the next iteration. Note that * this problem doesn't exist for the first iteration as * add_wait_queue() has full barrier semantics. 
*/ smp_store_mb(pwq->triggered, 0); return rc; } /** * poll_select_set_timeout - helper function to setup the timeout value * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * * Note, we do not use a timespec for the user space value here, That * way we can use the function for timeval and compat interfaces as well. * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. */ int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ if (!sec && !nsec) { to->tv_sec = to->tv_nsec = 0; } else { ktime_get_ts64(to); *to = timespec64_add_safe(*to, ts); } return 0; } enum poll_time_type { PT_TIMEVAL = 0, PT_OLD_TIMEVAL = 1, PT_TIMESPEC = 2, PT_OLD_TIMESPEC = 3, }; static int poll_select_finish(struct timespec64 *end_time, void __user *p, enum poll_time_type pt_type, int ret) { struct timespec64 rts; restore_saved_sigmask_unless(ret == -ERESTARTNOHAND); if (!p) return ret; if (current->personality & STICKY_TIMEOUTS) goto sticky; /* No update for zero timeout */ if (!end_time->tv_sec && !end_time->tv_nsec) return ret; ktime_get_ts64(&rts); rts = timespec64_sub(*end_time, rts); if (rts.tv_sec < 0) rts.tv_sec = rts.tv_nsec = 0; switch (pt_type) { case PT_TIMEVAL: { struct __kernel_old_timeval rtv; if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_OLD_TIMEVAL: { struct old_timeval32 rtv; rtv.tv_sec = rts.tv_sec; rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } break; case PT_TIMESPEC: if (!put_timespec64(&rts, p)) return ret; break; case PT_OLD_TIMESPEC: if (!put_old_timespec32(&rts, p)) return ret; 
break; default: BUG(); } /* * If an application puts its timeval in read-only memory, we * don't want the Linux-specific update to the timeval to * cause a fault after the select has completed * successfully. However, because we're not updating the * timeval, we can't restart the system call. */ sticky: if (ret == -ERESTARTNOHAND) ret = -EINTR; return ret; } /* * Scalable version of the fd_set. */ typedef struct { unsigned long *in, *out, *ex; unsigned long *res_in, *res_out, *res_ex; } fd_set_bits; /* * How many longwords for "nr" bits? */ #define FDS_BITPERLONG (8*sizeof(long)) #define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) #define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) /* * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. */ static inline int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { nr = FDS_BYTES(nr); if (ufdset) return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; memset(fdset, 0, nr); return 0; } static inline unsigned long __must_check set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) { if (ufdset) return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); return 0; } static inline void zero_fd_set(unsigned long nr, unsigned long *fdset) { memset(fdset, 0, FDS_BYTES(nr)); } #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) #define BITS(fds, n) (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n)) static int max_select_fd(unsigned long n, fd_set_bits *fds) { unsigned long *open_fds; unsigned long set; int max; struct fdtable *fdt; /* handle last in-complete long-word first */ set = ~(~0UL << (n & (BITS_PER_LONG-1))); n /= BITS_PER_LONG; fdt = files_fdtable(current->files); open_fds = fdt->open_fds + n; max = 0; if (set) { set &= BITS(fds, n); if (set) { if (!(set & ~*open_fds)) goto get_max; return -EBADF; } } while (n) { open_fds--; n--; set = BITS(fds, n); if (!set) continue; if (set & ~*open_fds) return 
-EBADF; if (max) continue; get_max: do { max++; set >>= 1; } while (set); max += n * BITS_PER_LONG; } return max; } #define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ EPOLLNVAL) #define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ EPOLLNVAL) #define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, __poll_t ll_flag) { wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) wait->_key |= POLLOUT_SET; } static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; rcu_read_lock(); retval = max_select_fd(n, fds); rcu_read_unlock(); if (retval < 0) return retval; n = retval; poll_initwait(&table); wait = &table.pt; if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { wait->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; bool can_busy_loop = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; for (i = 0; i < n; ++rinp, ++routp, ++rexp) { unsigned long in, out, ex, all_bits, bit = 1, j; unsigned long res_in = 0, res_out = 0, res_ex = 0; __poll_t mask; in = *inp++; out = *outp++; ex = *exp++; all_bits = in | out | ex; if (all_bits == 0) { i += BITS_PER_LONG; continue; } for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) { struct fd f; if (i >= n) break; if (!(bit & all_bits)) continue; mask = EPOLLNVAL; f = fdget(i); if (f.file) { wait_key_set(wait, in, out, bit, busy_flag); mask = vfs_poll(f.file, wait); fdput(f); } if ((mask & POLLIN_SET) && (in & bit)) { 
res_in |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; wait->_qproc = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; wait->_qproc = NULL; } /* got something, stop busy polling */ if (retval) { can_busy_loop = false; busy_flag = 0; /* * only remember a returned * POLL_BUSY_LOOP if we asked for it */ } else if (busy_flag & mask) can_busy_loop = true; } if (res_in) *rinp = res_in; if (res_out) *routp = res_out; if (res_ex) *rexp = res_ex; cond_resched(); } wait->_qproc = NULL; if (retval || timed_out || signal_pending(current)) break; if (table.error) { retval = table.error; break; } /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } poll_freewait(&table); return retval; } /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. 
*/ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int ret, max_fds; size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; if (size > (SIZE_MAX / 6)) goto out_nofds; alloc_size = 6 * size; bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; fds.res_in = bits + 3*size; fds.res_out = bits + 4*size; fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (set_fd_set(n, inp, fds.res_in) || set_fd_set(n, outp, fds.res_out) || set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kvfree(bits); out_nofds: return ret; } static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; struct __kernel_old_timeval tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if 
(poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret); } SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp) { return kern_select(n, inp, outp, exp, tvp); } static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, void __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } /* * Most architectures can't handle 7-argument syscalls. So we provide a * 6-argument version where the sixth argument is a pointer to a structure * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. 
*/ struct sigset_argpack { sigset_t __user *p; size_t size; }; static inline int get_sigset_argpack(struct sigset_argpack *to, struct sigset_argpack __user *from) { // the path is hot enough for overhead of copy_from_user() to matter if (from) { if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_access_end(); return -EFAULT; } SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct sigset_argpack x = {NULL, 0}; if (get_sigset_argpack(&x, sig)) return -EFAULT; return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_OLD_TIMESPEC); } #endif #ifdef __ARCH_WANT_SYS_OLD_SELECT struct sel_arg_struct { unsigned long n; fd_set __user *inp, *outp, *exp; struct __kernel_old_timeval __user *tvp; }; SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) { struct sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp); } #endif struct poll_list { struct poll_list *next; unsigned int len; struct pollfd entries[]; }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) /* * Fish for pollable events on the pollfd->fd file descriptor. We're only * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. 
The * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, bool *can_busy_poll, __poll_t busy_flag) { int fd = pollfd->fd; __poll_t mask = 0, filter; struct fd f; if (fd < 0) goto out; mask = EPOLLNVAL; f = fdget(fd); if (!f.file) goto out; /* userland u16 ->events contains POLL... bitmap */ filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP; pwait->_key = filter | busy_flag; mask = vfs_poll(f.file, pwait); if (mask & busy_flag) *can_busy_poll = true; mask &= filter; /* Mask out unneeded events. */ fdput(f); out: /* ... and so does ->revents */ pollfd->revents = mangle_poll(mask); return mask; } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; u64 slack = 0; __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; unsigned long busy_start = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { pt->_qproc = NULL; timed_out = 1; } if (end_time && !timed_out) slack = select_estimate_accuracy(end_time); for (;;) { struct poll_list *walk; bool can_busy_loop = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; pfd = walk->entries; pfd_end = pfd + walk->len; for (; pfd != pfd_end; pfd++) { /* * Fish for events. If we found one, record it * and kill poll_table->_qproc, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. */ if (do_pollfd(pfd, pt, &can_busy_loop, busy_flag)) { count++; pt->_qproc = NULL; /* found something, stop busy polling */ busy_flag = 0; can_busy_loop = false; } } } /* * All waiters have already been registered, so don't provide * a poll_table->_qproc to them on the next loop iteration. 
*/ pt->_qproc = NULL; if (!count) { count = wait->error; if (signal_pending(current)) count = -ERESTARTNOHAND; } if (count || timed_out) break; /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { if (!busy_start) { busy_start = busy_loop_current_time(); continue; } if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec64_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } return count; } #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; unsigned int todo = nfds; unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; len = min_t(unsigned int, nfds, N_STACK_PPS); for (;;) { walk->next = NULL; walk->len = len; if (!len) break; if (copy_from_user(walk->entries, ufds + nfds-todo, sizeof(struct pollfd) * walk->len)) goto out_fds; if (walk->len >= todo) break; todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk, entries, len), GFP_KERNEL); if (!walk) { err = -ENOMEM; goto out_fds; } } poll_initwait(&table); fdcount = do_poll(head, &table, end_time); poll_freewait(&table); if (!user_write_access_begin(ufds, nfds * sizeof(*ufds))) goto out_fds; for (walk = head; walk; walk = walk->next) { struct pollfd *fds = 
walk->entries; unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); } user_write_access_end(); err = fdcount; out_fds: walk = head->next; while (walk) { struct poll_list *pos = walk; walk = walk->next; kfree(pos); } return err; Efault: user_write_access_end(); err = -EFAULT; goto out_fds; } static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { end_time.tv_sec = restart_block->poll.tv_sec; end_time.tv_nsec = restart_block->poll.tv_nsec; to = &end_time; } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) ret = set_restart_fn(restart_block, do_restart_poll); return ret; } SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { to = &end_time; poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); } ret = do_sys_poll(ufds, nfds, to); if (ret == -ERESTARTNOHAND) { struct restart_block *restart_block; restart_block = ¤t->restart_block; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { restart_block->poll.tv_sec = end_time.tv_sec; restart_block->poll.tv_nsec = end_time.tv_nsec; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; ret = set_restart_fn(restart_block, do_restart_poll); } return ret; } SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = 
set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif #ifdef CONFIG_COMPAT #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) /* * Ooo, nasty. We need here to frob 32-bit unsigned longs to * 64-bit unsigned longs. */ static int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (ufdset) { return compat_get_bitmap(fdset, ufdset, nr); } else { zero_fd_set(nr, fdset); return 0; } } static int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, unsigned long *fdset) { if (!ufdset) return 0; return compat_put_bitmap(ufdset, fdset, nr); } /* * This is a virtual copy of sys_select from fs/select.c and probably * should be compared to it from time to time */ /* * We can actually return ERESTARTSYS instead of EINTR, but I'd * like to be certain this leads to no problems. So I return * EINTR just for safety. * * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. 
*/ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int size, max_fds, ret = -EINVAL; struct fdtable *fdt; long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { bits = kmalloc_array(6, size, GFP_KERNEL); ret = -ENOMEM; if (!bits) goto out_nofds; } fds.in = (unsigned long *) bits; fds.out = (unsigned long *) (bits + size); fds.ex = (unsigned long *) (bits + 2*size); fds.res_in = (unsigned long *) (bits + 3*size); fds.res_out = (unsigned long *) (bits + 4*size); fds.res_ex = (unsigned long *) (bits + 5*size); if ((ret = compat_get_fd_set(n, inp, fds.in)) || (ret = compat_get_fd_set(n, outp, fds.out)) || (ret = compat_get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (compat_set_fd_set(n, inp, fds.res_in) || compat_set_fd_set(n, outp, fds.res_out) || compat_set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kfree(bits); out_nofds: return ret; } static int do_compat_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timeval32 __user *tvp) { struct timespec64 end_time, *to = NULL; struct old_timeval32 tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if 
(poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_OLD_TIMEVAL, ret); } COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timeval32 __user *, tvp) { return do_compat_select(n, inp, outp, exp, tvp); } struct compat_sel_arg_struct { compat_ulong_t n; compat_uptr_t inp; compat_uptr_t outp; compat_uptr_t exp; compat_uptr_t tvp; }; COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) { struct compat_sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), compat_ptr(a.exp), compat_ptr(a.tvp)); } static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, void __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } struct compat_sigset_argpack { compat_uptr_t p; compat_size_t size; }; static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, struct compat_sigset_argpack __user *from) { if (from) { if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); 
user_read_access_end(); } return 0; /* NOTE(review): the success path above ends with user_read_access_end() but this fault path ends with user_access_end(); confirm which one pairs with the begin call earlier in this function. */ Efault: user_access_end(); return -EFAULT; } /* Compat pselect6 with a struct __kernel_timespec timeout. The (sigset pointer, size) pair behind @sig is fetched with get_compat_sigset_argpack(); -EFAULT is returned if that fails. Otherwise the call is handed to do_compat_pselect() with PT_TIMESPEC. */ COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) /* Compat pselect6 with a struct old_timespec32 timeout: same steps as the time64 variant, but do_compat_pselect() is called with PT_OLD_TIMESPEC. */ COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_OLD_TIMESPEC); } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) /* Compat ppoll with a struct old_timespec32 timeout. A non-NULL @tsp is read with get_old_timespec32() (-EFAULT on failure) and turned into an end time by poll_select_set_timeout() (-EINVAL on failure); with a NULL @tsp, @to stays NULL. The signal mask is installed with set_compat_user_sigmask() before do_sys_poll() runs, and the poll result is passed through poll_select_finish(). */ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif /* New compat syscall for 64 bit time_t: the timeout is a struct __kernel_timespec read with get_timespec64() (-EFAULT on failure, -EINVAL if poll_select_set_timeout() rejects it), and the result is finished with PT_TIMESPEC. */ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask,
sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #endif |
| 3461 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 | /* * linux/include/linux/console.h * * Copyright (C) 1993 Hamish Macdonald * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. * * Changed: * 10-Mar-94: Arno Griffioen: Conversion for vt100 emulator port from PC LINUX */ #ifndef _LINUX_CONSOLE_H_ #define _LINUX_CONSOLE_H_ 1 #include <linux/atomic.h> #include <linux/bits.h> #include <linux/rculist.h> #include <linux/types.h> #include <linux/vesa.h> struct vc_data; struct console_font_op; struct console_font; struct module; struct tty_struct; struct notifier_block; enum con_scroll { SM_UP, SM_DOWN, }; enum vc_intensity; /** * struct consw - callbacks for consoles * * @owner: the module to get references of when this console is used * @con_startup: set up the console and return its name (like VGA, EGA, ...) * @con_init: initialize the console on @vc. @init is true for the very first * call on this @vc. * @con_deinit: deinitialize the console from @vc. * @con_clear: erase @count characters at [@x, @y] on @vc. @count >= 1. * @con_putc: emit one character with attributes @ca to [@x, @y] on @vc. * (optional -- @con_putcs would be called instead) * @con_putcs: emit @count characters with attributes @s to [@x, @y] on @vc. * @con_cursor: enable/disable cursor depending on @enable * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. * @con_switch: notifier about the console switch; it is supposed to return * true if a redraw is needed. * @con_blank: blank/unblank the console. The target mode is passed in @blank. * @mode_switch is set if changing from/to text/graphics. The hook * is supposed to return true if a redraw is needed. * @con_font_set: set console @vc font to @font with height @vpitch.
@flags can * be %KD_FONT_FLAG_DONT_RECALC. (optional) * @con_font_get: fetch the current font on @vc of height @vpitch into @font. * (optional) * @con_font_default: set default font on @vc. @name can be %NULL or font name * to search for. @font can be filled back. (optional) * @con_resize: resize the @vc console to @width x @height. @from_user is true * when this change comes from the user space. * @con_set_palette: sets the palette of the console @vc to @table (optional) * @con_scrolldelta: the contents of the console should be scrolled by @lines. * Invoked by user. (optional) * @con_set_origin: set origin (see &vc_data::vc_origin) of the @vc. If not * provided or returns false, the origin is set to * @vc->vc_screenbuf. (optional) * @con_save_screen: save screen content into @vc->vc_screenbuf. Called e.g. * upon entering graphics. (optional) * @con_build_attr: build attributes based on @color, @intensity and other * parameters. The result is used for both normal and erase * characters. (optional) * @con_invert_region: invert a region of length @count on @vc starting at @p. * (optional) * @con_debug_enter: prepare the console for the debugger. This includes, but * is not limited to, unblanking the console, loading an * appropriate palette, and allowing debugger generated output. * (optional) * @con_debug_leave: restore the console to its pre-debug state as closely as * possible.
(optional) */ struct consw { struct module *owner; const char *(*con_startup)(void); void (*con_init)(struct vc_data *vc, bool init); void (*con_deinit)(struct vc_data *vc); void (*con_clear)(struct vc_data *vc, unsigned int y, unsigned int x, unsigned int count); void (*con_putc)(struct vc_data *vc, u16 ca, unsigned int y, unsigned int x); void (*con_putcs)(struct vc_data *vc, const u16 *s, unsigned int count, unsigned int ypos, unsigned int xpos); void (*con_cursor)(struct vc_data *vc, bool enable); bool (*con_scroll)(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int lines); bool (*con_switch)(struct vc_data *vc); bool (*con_blank)(struct vc_data *vc, enum vesa_blank_mode blank, bool mode_switch); int (*con_font_set)(struct vc_data *vc, const struct console_font *font, unsigned int vpitch, unsigned int flags); int (*con_font_get)(struct vc_data *vc, struct console_font *font, unsigned int vpitch); int (*con_font_default)(struct vc_data *vc, struct console_font *font, const char *name); int (*con_resize)(struct vc_data *vc, unsigned int width, unsigned int height, bool from_user); void (*con_set_palette)(struct vc_data *vc, const unsigned char *table); void (*con_scrolldelta)(struct vc_data *vc, int lines); bool (*con_set_origin)(struct vc_data *vc); void (*con_save_screen)(struct vc_data *vc); u8 (*con_build_attr)(struct vc_data *vc, u8 color, enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic); void (*con_invert_region)(struct vc_data *vc, u16 *p, int count); void (*con_debug_enter)(struct vc_data *vc); void (*con_debug_leave)(struct vc_data *vc); }; extern const struct consw *conswitchp; extern const struct consw dummy_con; /* dummy console buffer */ extern const struct consw vga_con; /* VGA text console */ extern const struct consw newport_con; /* SGI Newport console */ struct screen_info; #ifdef CONFIG_VGA_CONSOLE void vgacon_register_screen(struct screen_info *si); #else /* Empty stub when CONFIG_VGA_CONSOLE is not set. */ static inline
void vgacon_register_screen(struct screen_info *si) { } #endif int con_is_bound(const struct consw *csw); int do_unregister_con_driver(const struct consw *csw); int do_take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); #ifdef CONFIG_VT void con_debug_enter(struct vc_data *vc); void con_debug_leave(void); #else /* Empty stubs when CONFIG_VT is not set. */ static inline void con_debug_enter(struct vc_data *vc) { } static inline void con_debug_leave(void) { } #endif /* * The interface for a console, or any other device that wants to capture * console messages (printer driver?) */ /** * enum cons_flags - General console flags * @CON_PRINTBUFFER: Used by newly registered consoles to avoid duplicate * output of messages that were already shown by boot * consoles or read by userspace via syslog() syscall. * @CON_CONSDEV: Indicates that the console driver is backing * /dev/console. * @CON_ENABLED: Indicates if a console is allowed to print records. If * false, the console also will not advance to later * records. * @CON_BOOT: Marks the console driver as early console driver which * is used during boot before the real driver becomes * available. It will be automatically unregistered * when the real console driver is registered unless * "keep_bootcon" parameter is used. * @CON_ANYTIME: A misnamed historical flag which tells the core code * that the legacy @console::write callback can be invoked * on a CPU which is marked OFFLINE. That is misleading as * it suggests that there is no contextual limit for * invoking the callback. The original motivation was * readiness of the per-CPU areas. * @CON_BRL: Indicates a braille device which is exempt from * receiving the printk spam for obvious reasons. * @CON_EXTENDED: The console supports the extended output format of * /dev/kmesg which requires a larger output buffer. * @CON_SUSPENDED: Indicates if a console is suspended. If true, the * printing callbacks must not be called.
* @CON_NBCON: Console can operate outside of the legacy style console_lock * constraints. */ enum cons_flags { CON_PRINTBUFFER = BIT(0), CON_CONSDEV = BIT(1), CON_ENABLED = BIT(2), CON_BOOT = BIT(3), CON_ANYTIME = BIT(4), CON_BRL = BIT(5), CON_EXTENDED = BIT(6), CON_SUSPENDED = BIT(7), CON_NBCON = BIT(8), }; /** * struct nbcon_state - console state for nbcon consoles * @atom: Compound of the state fields for atomic operations * * @req_prio: The priority of a handover request * @prio: The priority of the current owner * @unsafe: Console is busy in a non takeover region * @unsafe_takeover: A hostile takeover in an unsafe state happened in the * past. The console cannot be safe until re-initialized. * @cpu: The CPU on which the owner runs * * To be used for reading and preparing of the value stored in the nbcon * state variable @console::nbcon_state. * * The @prio and @req_prio fields are particularly important to allow * spin-waiting to timeout and give up without the risk of a waiter being * assigned the lock after giving up. */ struct nbcon_state { union { unsigned int atom; struct { unsigned int prio : 2; unsigned int req_prio : 2; unsigned int unsafe : 1; unsigned int unsafe_takeover : 1; unsigned int cpu : 24; }; }; }; /* * The nbcon_state struct is used to easily create and interpret values that * are stored in the @console::nbcon_state variable. Ensure this struct stays * within the size boundaries of the atomic variable's underlying type in * order to avoid any accidental truncation. */ static_assert(sizeof(struct nbcon_state) <= sizeof(int)); /** * enum nbcon_prio - console owner priority for nbcon consoles * @NBCON_PRIO_NONE: Unused * @NBCON_PRIO_NORMAL: Normal (non-emergency) usage * @NBCON_PRIO_EMERGENCY: Emergency output (WARN/OOPS...) * @NBCON_PRIO_PANIC: Panic output * @NBCON_PRIO_MAX: The number of priority levels * * A higher priority context can takeover the console when it is * in the safe state.
The final attempt to flush consoles in panic() * can be allowed to do so even in an unsafe state (Hope and pray). */ enum nbcon_prio { NBCON_PRIO_NONE = 0, NBCON_PRIO_NORMAL, NBCON_PRIO_EMERGENCY, NBCON_PRIO_PANIC, NBCON_PRIO_MAX, }; struct console; struct printk_buffers; /** * struct nbcon_context - Context for console acquire/release * @console: The associated console * @spinwait_max_us: Limit for spin-wait acquire * @prio: Priority of the context * @allow_unsafe_takeover: Allow performing takeover even if unsafe. Can * be used only with NBCON_PRIO_PANIC @prio. It * might cause a system freeze when the console * is used later. * @backlog: Ringbuffer has pending records * @pbufs: Pointer to the text buffer for this context * @seq: The sequence number to print for this context */ struct nbcon_context { /* members set by caller */ struct console *console; unsigned int spinwait_max_us; enum nbcon_prio prio; unsigned int allow_unsafe_takeover : 1; /* members set by emit */ unsigned int backlog : 1; /* members set by acquire */ struct printk_buffers *pbufs; u64 seq; }; /** * struct nbcon_write_context - Context handed to the nbcon write callbacks * @ctxt: The core console context * @outbuf: Pointer to the text buffer for output * @len: Length to write * @unsafe_takeover: If a hostile takeover in an unsafe state has occurred */ struct nbcon_write_context { struct nbcon_context __private ctxt; char *outbuf; unsigned int len; bool unsafe_takeover; }; /** * struct console - The console descriptor structure * @name: The name of the console driver * @write: Write callback to output messages (Optional) * @read: Read callback for console input (Optional) * @device: The underlying TTY device driver (Optional) * @unblank: Callback to unblank the console (Optional) * @setup: Callback for initializing the console (Optional) * @exit: Callback for teardown of the console (Optional) * @match: Callback for matching a console (Optional) * @flags: Console flags.
See enum cons_flags * @index: Console index, e.g. port number * @cflag: TTY control mode flags * @ispeed: TTY input speed * @ospeed: TTY output speed * @seq: Sequence number of the next ringbuffer record to print * @dropped: Number of unreported dropped ringbuffer records * @data: Driver private data * @node: hlist node for the console list * * @write_atomic: Write callback for atomic context * @nbcon_state: State for nbcon consoles * @nbcon_seq: Sequence number of the next record for nbcon to print * @pbufs: Pointer to nbcon private buffer */ struct console { char name[16]; void (*write)(struct console *co, const char *s, unsigned int count); int (*read)(struct console *co, char *s, unsigned int count); struct tty_driver *(*device)(struct console *co, int *index); void (*unblank)(void); int (*setup)(struct console *co, char *options); int (*exit)(struct console *co); int (*match)(struct console *co, char *name, int idx, char *options); short flags; short index; int cflag; uint ispeed; uint ospeed; u64 seq; unsigned long dropped; void *data; struct hlist_node node; /* nbcon console specific members */ bool (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; struct printk_buffers *pbufs; }; #ifdef CONFIG_LOCKDEP extern void lockdep_assert_console_list_lock_held(void); #else /* Without CONFIG_LOCKDEP the assertion is an empty stub. */ static inline void lockdep_assert_console_list_lock_held(void) { } #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC extern bool console_srcu_read_lock_is_held(void); #else /* Without CONFIG_DEBUG_LOCK_ALLOC the check always reports the lock as held. */ static inline bool console_srcu_read_lock_is_held(void) { return 1; } #endif extern int console_srcu_read_lock(void); extern void console_srcu_read_unlock(int cookie); extern void console_list_lock(void) __acquires(console_mutex); extern void console_list_unlock(void) __releases(console_mutex); extern struct hlist_head console_list; /** * console_srcu_read_flags - Locklessly read the console flags * @con: struct console pointer of console to read flags
from * * This function provides the necessary READ_ONCE() and data_race() * notation for locklessly reading the console flags. The READ_ONCE() * in this function matches the WRITE_ONCE() when @flags are modified * for registered consoles with console_srcu_write_flags(). * * Only use this function to read console flags when locklessly * iterating the console list via srcu. * * Context: Any context. */ static inline short console_srcu_read_flags(const struct console *con) { WARN_ON_ONCE(!console_srcu_read_lock_is_held()); /* * Locklessly reading console->flags provides a consistent * read value because there is at most one CPU modifying * console->flags and that CPU is using only read-modify-write * operations to do so. */ return data_race(READ_ONCE(con->flags)); } /** * console_srcu_write_flags - Write flags for a registered console * @con: struct console pointer of console to write flags to * @flags: new flags value to write * * Only use this function to write flags for registered consoles. It * requires holding the console_list_lock. * * Context: Any context. */ static inline void console_srcu_write_flags(struct console *con, short flags) { lockdep_assert_console_list_lock_held(); /* This matches the READ_ONCE() in console_srcu_read_flags(). */ WRITE_ONCE(con->flags, flags); } /* Variant of console_is_registered() when the console_list_lock is held. */ static inline bool console_is_registered_locked(const struct console *con) { lockdep_assert_console_list_lock_held(); return !hlist_unhashed(&con->node); } /** * console_is_registered - Check if the console is registered * @con: struct console pointer of console to check * * Context: Process context. May sleep while acquiring console list lock. * Return: true if the console is in the console list, otherwise false. * * If false is returned for a console that was previously registered, it * can be assumed that the console's unregistration is fully completed, * including the exit() callback after console list removal.
*/ static inline bool console_is_registered(const struct console *con) { bool ret; console_list_lock(); ret = console_is_registered_locked(con); console_list_unlock(); return ret; } /** * for_each_console_srcu() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * Although SRCU guarantees the console list will be consistent, the * struct console fields may be updated by other CPUs while iterating. * * Requires console_srcu_read_lock to be held. Can be invoked from * any context. */ #define for_each_console_srcu(con) \ hlist_for_each_entry_srcu(con, &console_list, node, \ console_srcu_read_lock_is_held()) /** * for_each_console() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * The console list and the &console.flags are immutable while iterating. * * Requires console_list_lock to be held. * * Note: this expands to two statements (the lockdep assertion followed by * the loop), so it must not be used as the sole body of an unbraced * if/else or loop. */ #define for_each_console(con) \ lockdep_assert_console_list_lock_held(); \ hlist_for_each_entry(con, &console_list, node) #ifdef CONFIG_PRINTK extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); #else /* Without CONFIG_PRINTK all three nbcon helpers are stubs that return false. */ static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } #endif extern int console_set_on_cmdline; extern struct console *early_console; enum con_flush_mode { CONSOLE_FLUSH_PENDING, CONSOLE_REPLAY_ALL, }; extern int add_preferred_console(const char *name, const short idx, char *options); extern void console_force_preferred_locked(struct console *con); extern void register_console(struct console *); extern int unregister_console(struct console *); extern void console_lock(void); extern int console_trylock(void); extern void console_unlock(void);
extern void console_conditional_schedule(void); extern void console_unblank(void); extern void console_flush_on_panic(enum con_flush_mode mode); extern struct tty_driver *console_device(int *); extern void console_stop(struct console *); extern void console_start(struct console *); extern int is_console_locked(void); extern int braille_register_console(struct console *, int index, char *console_options, char *braille_options); extern int braille_unregister_console(struct console *); #ifdef CONFIG_TTY extern void console_sysfs_notify(void); #else /* Empty stub when CONFIG_TTY is not set. */ static inline void console_sysfs_notify(void) { } #endif extern bool console_suspend_enabled; /* Suspend and resume console messages over PM events */ extern void suspend_console(void); extern void resume_console(void); int mda_console_init(void); void vcs_make_sysfs(int index); void vcs_remove_sysfs(int index); /* Some debug stub to catch some of the obvious races in the VT code */ #define WARN_CONSOLE_UNLOCKED() \ WARN_ON(!atomic_read(&ignore_console_lock_warning) && \ !is_console_locked() && !oops_in_progress) /* * Increment ignore_console_lock_warning if you need to quiet * WARN_CONSOLE_UNLOCKED() for debugging purposes. */ extern atomic_t ignore_console_lock_warning; extern void console_init(void); /* For deferred console takeover */ void dummycon_register_output_notifier(struct notifier_block *nb); void dummycon_unregister_output_notifier(struct notifier_block *nb); #endif /* _LINUX_CONSOLE_H_ */ |
| 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_TASK_STACK_H #define _LINUX_SCHED_TASK_STACK_H /* * task->stack (kernel stack) handling interfaces: */ #include <linux/sched.h> #include <linux/magic.h> #include <linux/refcount.h> #ifdef CONFIG_THREAD_INFO_IN_TASK /* * When accessing the stack of a non-current task that might exit, use * try_get_task_stack() instead. task_stack_page will return a pointer * that could get freed out from under you. */ static __always_inline void *task_stack_page(const struct task_struct *task) { return task->stack; } /* In this configuration setup_thread_stack() expands to an empty statement. */ #define setup_thread_stack(new,old) do { } while(0) /* * End-of-stack word: with CONFIG_STACK_GROWSUP the last long inside the * THREAD_SIZE area that starts at task->stack, otherwise task->stack itself. */ static __always_inline unsigned long *end_of_stack(const struct task_struct *task) { #ifdef CONFIG_STACK_GROWSUP return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1; #else return task->stack; #endif } #elif !defined(__HAVE_THREAD_FUNCTIONS) #define task_stack_page(task) ((void *)(task)->stack) /* Copy org's thread_info into p's, then point the copy's ->task back at p. */ static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) { *task_thread_info(p) = *task_thread_info(org); task_thread_info(p)->task = p; } /* * Return the address of the last usable long on the stack. * * When the stack grows down, this is just above the thread * info struct. Going any lower will corrupt the threadinfo. * * When the stack grows up, this is the highest address. * Beyond that position, we corrupt data on the next page.
*/ static inline unsigned long *end_of_stack(struct task_struct *p) { #ifdef CONFIG_STACK_GROWSUP return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1; #else return (unsigned long *)(task_thread_info(p) + 1); #endif } #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* * Try to take a reference on tsk's stack through tsk->stack_refcount. * Returns task_stack_page(tsk) on success, or NULL when the refcount was * already zero. Presumably paired with put_task_stack() - confirm against * its definition. */ static inline void *try_get_task_stack(struct task_struct *tsk) { return refcount_inc_not_zero(&tsk->stack_refcount) ? task_stack_page(tsk) : NULL; } extern void put_task_stack(struct task_struct *tsk); #else /* * No stack refcount is used here: try_get_task_stack() simply returns * task_stack_page(tsk) and put_task_stack() does nothing. */ static inline void *try_get_task_stack(struct task_struct *tsk) { return task_stack_page(tsk); } static inline void put_task_stack(struct task_struct *tsk) {} #endif void exit_task_stack_account(struct task_struct *tsk); /* True when the word at end_of_stack(task) no longer holds STACK_END_MAGIC. */ #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) /* * Returns nonzero when obj points into the current task's stack area, i.e. * into [task_stack_page(current), task_stack_page(current) + THREAD_SIZE). */ static inline int object_is_on_stack(const void *obj) { void *stack = task_stack_page(current); return (obj >= stack) && (obj < (stack + THREAD_SIZE)); } extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE /* * Step word by word away from end_of_stack(p) (downwards with * CONFIG_STACK_GROWSUP, upwards otherwise) until the first non-zero word, * and return the distance in bytes between that word and end_of_stack(p). */ static inline unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ # ifdef CONFIG_STACK_GROWSUP n--; # else n++; # endif } while (!*n); # ifdef CONFIG_STACK_GROWSUP return (unsigned long)end_of_stack(p) - (unsigned long)n; # else return (unsigned long)n - (unsigned long)end_of_stack(p); # endif } #endif extern void set_task_stack_end_magic(struct task_struct *tsk); #ifndef __HAVE_ARCH_KSTACK_END static inline int kstack_end(void *addr) { /* Reliable end of stack detection: * Some APM bios versions misalign the stack */ return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); } #endif #endif /* _LINUX_SCHED_TASK_STACK_H */ |
| 1179 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_HARDIRQ_H #define _ASM_X86_HARDIRQ_H #include <linux/threads.h> #include <asm/current.h> /* * Interrupt statistics block, declared per CPU as irq_stat below. Apart from * the u8 kvm_cpu_l1tf_flush_l1d flag every member is an unsigned int, and * most members exist only when the matching config option is enabled. */ typedef struct { #if IS_ENABLED(CONFIG_KVM_INTEL) u8 kvm_cpu_l1tf_flush_l1d; #endif unsigned int __nmi_count; /* arch dependent */ #ifdef CONFIG_X86_LOCAL_APIC unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; unsigned int icr_read_retry_count; #endif #if IS_ENABLED(CONFIG_KVM) unsigned int kvm_posted_intr_ipis; unsigned int kvm_posted_intr_wakeup_ipis; unsigned int kvm_posted_intr_nested_ipis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; unsigned int apic_irq_work_irqs; #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; #endif unsigned int irq_tlb_count; #ifdef CONFIG_X86_THERMAL_VECTOR unsigned int irq_thermal_count; #endif #ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; #endif #ifdef CONFIG_X86_MCE_AMD unsigned int irq_deferred_error_count; #endif #ifdef CONFIG_X86_HV_CALLBACK_VECTOR unsigned int irq_hv_callback_count; #endif #if IS_ENABLED(CONFIG_HYPERV) unsigned int irq_hv_reenlightenment_count; unsigned int hyperv_stimer0_count; #endif #ifdef CONFIG_X86_POSTED_MSI unsigned int posted_msi_notification_count; #endif } ____cacheline_aligned irq_cpustat_t; DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); #ifdef CONFIG_X86_POSTED_MSI DECLARE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc); #endif #define __ARCH_IRQ_STAT /* Increment one irq_stat member on the executing CPU, e.g. inc_irq_stat(irq_tlb_count). */ #define inc_irq_stat(member) this_cpu_inc(irq_stat.member) extern void ack_bad_irq(unsigned int irq); /* The self-referential defines below make these names visible to #ifdef tests; presumably generic code checks for them - confirm. */ extern u64 arch_irq_stat_cpu(unsigned int cpu); #define arch_irq_stat_cpu arch_irq_stat_cpu extern u64
arch_irq_stat(void); #define arch_irq_stat arch_irq_stat #define local_softirq_pending_ref pcpu_hot.softirq_pending #if IS_ENABLED(CONFIG_KVM_INTEL) /* * Accessors for this CPU's irq_stat.kvm_cpu_l1tf_flush_l1d flag: * set writes 1, clear writes 0, get reads the flag back as a bool. */ static inline void kvm_set_cpu_l1tf_flush_l1d(void) { __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); } static __always_inline void kvm_clear_cpu_l1tf_flush_l1d(void) { __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0); } static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void) { return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d); } #else /* !IS_ENABLED(CONFIG_KVM_INTEL) */ /* Only the setter is provided in this configuration, as an empty stub. */ static inline void kvm_set_cpu_l1tf_flush_l1d(void) { } #endif /* IS_ENABLED(CONFIG_KVM_INTEL) */ #endif /* _ASM_X86_HARDIRQ_H */ |
| 21 16 19 33 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 | /* * linux/fs/nls/nls_cp874.c * * Charset cp874 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. 
*/ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2026, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 0x90*/ 0x0000, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 0xa0*/ 0x00a0, 0x0e01, 0x0e02, 0x0e03, 0x0e04, 0x0e05, 0x0e06, 0x0e07, 0x0e08, 0x0e09, 0x0e0a, 0x0e0b, 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f, /* 0xb0*/ 0x0e10, 0x0e11, 0x0e12, 0x0e13, 0x0e14, 0x0e15, 0x0e16, 0x0e17, 0x0e18, 0x0e19, 0x0e1a, 0x0e1b, 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f, /* 0xc0*/ 0x0e20, 0x0e21, 0x0e22, 0x0e23, 0x0e24, 0x0e25, 0x0e26, 0x0e27, 0x0e28, 0x0e29, 0x0e2a, 0x0e2b, 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f, /* 0xd0*/ 0x0e30, 0x0e31, 0x0e32, 
0x0e33, 0x0e34, 0x0e35, 0x0e36, 0x0e37, 0x0e38, 0x0e39, 0x0e3a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0e3f, /* 0xe0*/ 0x0e40, 0x0e41, 0x0e42, 0x0e43, 0x0e44, 0x0e45, 0x0e46, 0x0e47, 0x0e48, 0x0e49, 0x0e4a, 0x0e4b, 0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f, /* 0xf0*/ 0x0e50, 0x0e51, 0x0e52, 0x0e53, 0x0e54, 0x0e55, 0x0e56, 0x0e57, 0x0e58, 0x0e59, 0x0e5a, 0x0e5b, 0x0000, 0x0000, 0x0000, 0x0000, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char page0e[256] = { 0x00, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x00-0x07 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x08-0x0f */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 
0x10-0x17 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0x18-0x1f */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */ 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0x38-0x3f */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x50-0x57 */ 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x91, 0x92, 0x00, 0x00, 0x93, 0x94, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page0e, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 
0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 
0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct 
nls_table table = { .charset = "cp874", .alias = "tis-620", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp874(void) { return register_nls(&table); } static void __exit exit_nls_cp874(void) { unregister_nls(&table); } module_init(init_nls_cp874) module_exit(exit_nls_cp874) MODULE_DESCRIPTION("NLS Thai charset (CP874, TIS-620)"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_NLS(tis-620); |
| 4 5 4 2 1 4 4 1 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 
1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 | /* * videobuf2-core.h - Video Buffer 2 Core Framework * * Copyright (C) 2010 Samsung Electronics * * Author: Pawel Osciak <pawel@osciak.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation. 
*/ #ifndef _MEDIA_VIDEOBUF2_CORE_H #define _MEDIA_VIDEOBUF2_CORE_H #include <linux/mm_types.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/dma-buf.h> #include <linux/bitops.h> #include <media/media-request.h> #include <media/frame_vector.h> #define VB2_MAX_FRAME (32) #define VB2_MAX_PLANES (8) /** * enum vb2_memory - type of memory model used to make the buffers visible * on userspace. * * @VB2_MEMORY_UNKNOWN: Buffer status is unknown or it is not used yet on * userspace. * @VB2_MEMORY_MMAP: The buffers are allocated by the Kernel and it is * memory mapped via mmap() ioctl. This model is * also used when the user is using the buffers via * read() or write() system calls. * @VB2_MEMORY_USERPTR: The buffers was allocated in userspace and it is * memory mapped via mmap() ioctl. * @VB2_MEMORY_DMABUF: The buffers are passed to userspace via DMA buffer. */ enum vb2_memory { VB2_MEMORY_UNKNOWN = 0, VB2_MEMORY_MMAP = 1, VB2_MEMORY_USERPTR = 2, VB2_MEMORY_DMABUF = 4, }; struct vb2_fileio_data; struct vb2_threadio_data; struct vb2_buffer; /** * struct vb2_mem_ops - memory handling/memory allocator operations. * @alloc: allocate video memory and, optionally, allocator private data, * return ERR_PTR() on failure or a pointer to allocator private, * per-buffer data on success; the returned private structure * will then be passed as @buf_priv argument to other ops in this * structure. The size argument to this function shall be * *page aligned*. * @put: inform the allocator that the buffer will no longer be used; * usually will result in the allocator freeing the buffer (if * no other users of this buffer are present); the @buf_priv * argument is the allocator private per-buffer structure * previously returned from the alloc callback. * @get_dmabuf: acquire userspace memory for a hardware operation; used for * DMABUF memory types. 
* @get_userptr: acquire userspace memory for a hardware operation; used for * USERPTR memory types; vaddr is the address passed to the * videobuf2 layer when queuing a video buffer of USERPTR type; * should return an allocator private per-buffer structure * associated with the buffer on success, ERR_PTR() on failure; * the returned private structure will then be passed as @buf_priv * argument to other ops in this structure. * @put_userptr: inform the allocator that a USERPTR buffer will no longer * be used. * @prepare: called every time the buffer is passed from userspace to the * driver, useful for cache synchronisation, optional. * @finish: called every time the buffer is passed back from the driver * to the userspace, also optional. * @attach_dmabuf: attach a shared &struct dma_buf for a hardware operation; * used for DMABUF memory types; dev is the alloc device * dbuf is the shared dma_buf; returns ERR_PTR() on failure; * allocator private per-buffer structure on success; * this needs to be used for further accesses to the buffer. * @detach_dmabuf: inform the exporter of the buffer that the current DMABUF * buffer is no longer used; the @buf_priv argument is the * allocator private per-buffer structure previously returned * from the attach_dmabuf callback. * @map_dmabuf: request for access to the dmabuf from allocator; the allocator * of dmabuf is informed that this driver is going to use the * dmabuf. * @unmap_dmabuf: releases access control to the dmabuf - allocator is notified * that this driver is done using the dmabuf for now. * @vaddr: return a kernel virtual address to a given memory buffer * associated with the passed private structure or NULL if no * such mapping exists. * @cookie: return allocator specific cookie for a given memory buffer * associated with the passed private structure or NULL if not * available. 
* @num_users: return the current number of users of a memory buffer; * return 1 if the videobuf2 layer (or actually the driver using * it) is the only user. * @mmap: setup a userspace mapping for a given memory buffer under * the provided virtual memory region. * * Those operations are used by the videobuf2 core to implement the memory * handling/memory allocators for each type of supported streaming I/O method. * * .. note:: * #) Required ops for USERPTR types: get_userptr, put_userptr. * * #) Required ops for MMAP types: alloc, put, num_users, mmap. * * #) Required ops for read/write access types: alloc, put, num_users, vaddr. * * #) Required ops for DMABUF types: attach_dmabuf, detach_dmabuf, * map_dmabuf, unmap_dmabuf. */ struct vb2_mem_ops { void *(*alloc)(struct vb2_buffer *vb, struct device *dev, unsigned long size); void (*put)(void *buf_priv); struct dma_buf *(*get_dmabuf)(struct vb2_buffer *vb, void *buf_priv, unsigned long flags); void *(*get_userptr)(struct vb2_buffer *vb, struct device *dev, unsigned long vaddr, unsigned long size); void (*put_userptr)(void *buf_priv); void (*prepare)(void *buf_priv); void (*finish)(void *buf_priv); void *(*attach_dmabuf)(struct vb2_buffer *vb, struct device *dev, struct dma_buf *dbuf, unsigned long size); void (*detach_dmabuf)(void *buf_priv); int (*map_dmabuf)(void *buf_priv); void (*unmap_dmabuf)(void *buf_priv); void *(*vaddr)(struct vb2_buffer *vb, void *buf_priv); void *(*cookie)(struct vb2_buffer *vb, void *buf_priv); unsigned int (*num_users)(void *buf_priv); int (*mmap)(void *buf_priv, struct vm_area_struct *vma); }; /** * struct vb2_plane - plane information. * @mem_priv: private data with this plane. * @dbuf: dma_buf - shared buffer object. * @dbuf_mapped: flag to show whether dbuf is mapped or not * @bytesused: number of bytes occupied by data in the plane (payload). * @length: size of this plane (NOT the payload) in bytes. The maximum * valid size is MAX_UINT - PAGE_SIZE. 
* @min_length: minimum required size of this plane (NOT the payload) in bytes. * @length is always greater or equal to @min_length, and like * @length, it is limited to MAX_UINT - PAGE_SIZE. * @m: Union with memtype-specific data. * @m.offset: when memory in the associated struct vb2_buffer is * %VB2_MEMORY_MMAP, equals the offset from the start of * the device memory for this plane (or is a "cookie" that * should be passed to mmap() called on the video node). * @m.userptr: when memory is %VB2_MEMORY_USERPTR, a userspace pointer * pointing to this plane. * @m.fd: when memory is %VB2_MEMORY_DMABUF, a userspace file * descriptor associated with this plane. * @data_offset: offset in the plane to the start of data; usually 0, * unless there is a header in front of the data. * * Should contain enough information to be able to cover all the fields * of &struct v4l2_plane at videodev2.h. */ struct vb2_plane { void *mem_priv; struct dma_buf *dbuf; unsigned int dbuf_mapped; unsigned int bytesused; unsigned int length; unsigned int min_length; union { unsigned int offset; unsigned long userptr; int fd; } m; unsigned int data_offset; }; /** * enum vb2_io_modes - queue access methods. * @VB2_MMAP: driver supports MMAP with streaming API. * @VB2_USERPTR: driver supports USERPTR with streaming API. * @VB2_READ: driver supports read() style access. * @VB2_WRITE: driver supports write() style access. * @VB2_DMABUF: driver supports DMABUF with streaming API. */ enum vb2_io_modes { VB2_MMAP = BIT(0), VB2_USERPTR = BIT(1), VB2_READ = BIT(2), VB2_WRITE = BIT(3), VB2_DMABUF = BIT(4), }; /** * enum vb2_buffer_state - current video buffer state. * @VB2_BUF_STATE_DEQUEUED: buffer under userspace control. * @VB2_BUF_STATE_IN_REQUEST: buffer is queued in media request. * @VB2_BUF_STATE_PREPARING: buffer is being prepared in videobuf2. * @VB2_BUF_STATE_QUEUED: buffer queued in videobuf2, but not in driver. 
* @VB2_BUF_STATE_ACTIVE: buffer queued in driver and possibly used * in a hardware operation. * @VB2_BUF_STATE_DONE: buffer returned from driver to videobuf2, but * not yet dequeued to userspace. * @VB2_BUF_STATE_ERROR: same as above, but the operation on the buffer * has ended with an error, which will be reported * to the userspace when it is dequeued. */ enum vb2_buffer_state { VB2_BUF_STATE_DEQUEUED, VB2_BUF_STATE_IN_REQUEST, VB2_BUF_STATE_PREPARING, VB2_BUF_STATE_QUEUED, VB2_BUF_STATE_ACTIVE, VB2_BUF_STATE_DONE, VB2_BUF_STATE_ERROR, }; struct vb2_queue; /** * struct vb2_buffer - represents a video buffer. * @vb2_queue: pointer to &struct vb2_queue with the queue to * which this driver belongs. * @index: id number of the buffer. * @type: buffer type. * @memory: the method, in which the actual data is passed. * @num_planes: number of planes in the buffer * on an internal driver queue. * @timestamp: frame timestamp in ns. * @request: the request this buffer is associated with. * @req_obj: used to bind this buffer to a request. This * request object has a refcount. */ struct vb2_buffer { struct vb2_queue *vb2_queue; unsigned int index; unsigned int type; unsigned int memory; unsigned int num_planes; u64 timestamp; struct media_request *request; struct media_request_object req_obj; /* private: internal use only * * state: current buffer state; do not change * synced: this buffer has been synced for DMA, i.e. the * 'prepare' memop was called. It is cleared again * after the 'finish' memop is called. * prepared: this buffer has been prepared, i.e. the * buf_prepare op was called. It is cleared again * after the 'buf_finish' op is called. * copied_timestamp: the timestamp of this capture buffer was copied * from an output buffer. * skip_cache_sync_on_prepare: when set buffer's ->prepare() function * skips cache sync/invalidation. * skip_cache_sync_on_finish: when set buffer's ->finish() function * skips cache sync/invalidation. 
* planes: per-plane information; do not change * queued_entry: entry on the queued buffers list, which holds * all buffers queued from userspace * done_entry: entry on the list that stores all buffers ready * to be dequeued to userspace */ enum vb2_buffer_state state; unsigned int synced:1; unsigned int prepared:1; unsigned int copied_timestamp:1; unsigned int skip_cache_sync_on_prepare:1; unsigned int skip_cache_sync_on_finish:1; struct vb2_plane planes[VB2_MAX_PLANES]; struct list_head queued_entry; struct list_head done_entry; #ifdef CONFIG_VIDEO_ADV_DEBUG /* * Counters for how often these buffer-related ops are * called. Used to check for unbalanced ops. */ u32 cnt_mem_alloc; u32 cnt_mem_put; u32 cnt_mem_get_dmabuf; u32 cnt_mem_get_userptr; u32 cnt_mem_put_userptr; u32 cnt_mem_prepare; u32 cnt_mem_finish; u32 cnt_mem_attach_dmabuf; u32 cnt_mem_detach_dmabuf; u32 cnt_mem_map_dmabuf; u32 cnt_mem_unmap_dmabuf; u32 cnt_mem_vaddr; u32 cnt_mem_cookie; u32 cnt_mem_num_users; u32 cnt_mem_mmap; u32 cnt_buf_out_validate; u32 cnt_buf_init; u32 cnt_buf_prepare; u32 cnt_buf_finish; u32 cnt_buf_cleanup; u32 cnt_buf_queue; u32 cnt_buf_request_complete; /* This counts the number of calls to vb2_buffer_done() */ u32 cnt_buf_done; #endif }; /** * struct vb2_ops - driver-specific callbacks. * * These operations are not called from interrupt context except where * mentioned specifically. * * @queue_setup: called from VIDIOC_REQBUFS() and VIDIOC_CREATE_BUFS() * handlers before memory allocation. It can be called * twice: if the original number of requested buffers * could not be allocated, then it will be called a * second time with the actually allocated number of * buffers to verify if that is OK. * The driver should return the required number of buffers * in \*num_buffers, the required number of planes per * buffer in \*num_planes, the size of each plane should be * set in the sizes\[\] array and optional per-plane * allocator specific device in the alloc_devs\[\] array. 
* When called from VIDIOC_REQBUFS(), \*num_planes == 0, * the driver has to use the currently configured format to * determine the plane sizes and \*num_buffers is the total * number of buffers that are being allocated. When called * from VIDIOC_CREATE_BUFS(), \*num_planes != 0 and it * describes the requested number of planes and sizes\[\] * contains the requested plane sizes. In this case * \*num_buffers are being allocated additionally to * the buffers already allocated. If either \*num_planes * or the requested sizes are invalid, the callback must return %-EINVAL. * @wait_prepare: release any locks taken while calling vb2 functions; * it is called before an ioctl needs to wait for a new * buffer to arrive; required to avoid a deadlock in * blocking access type. * @wait_finish: reacquire all locks released in the previous callback; * required to continue operation after sleeping while * waiting for a new buffer to arrive. * @buf_out_validate: called when the output buffer is prepared or queued * to a request; drivers can use this to validate * userspace-provided information; this is required only * for OUTPUT queues. * @buf_init: called once after allocating a buffer (in MMAP case) * or after acquiring a new USERPTR buffer; drivers may * perform additional buffer-related initialization; * initialization failure (return != 0) will prevent * queue setup from completing successfully; optional. * @buf_prepare: called every time the buffer is queued from userspace * and from the VIDIOC_PREPARE_BUF() ioctl; drivers may * perform any initialization required before each * hardware operation in this callback; drivers can * access/modify the buffer here as it is still synced for * the CPU; drivers that support VIDIOC_CREATE_BUFS() must * also validate the buffer size; if an error is returned, * the buffer will not be queued in the driver; optional.
* @buf_finish: called before every dequeue of the buffer back to * userspace; the buffer is synced for the CPU, so drivers * can access/modify the buffer contents; drivers may * perform any operations required before userspace * accesses the buffer; optional. The buffer state can be * one of the following: %DONE and %ERROR occur while * streaming is in progress, and the %PREPARED state occurs * when the queue has been canceled and all pending * buffers are being returned to their default %DEQUEUED * state. Typically you only have to do something if the * state is %VB2_BUF_STATE_DONE, since in all other cases * the buffer contents will be ignored anyway. * @buf_cleanup: called once before the buffer is freed; drivers may * perform any additional cleanup; optional. * @prepare_streaming: called once to prepare for 'streaming' state; this is * where validation can be done to verify everything is * okay and streaming resources can be claimed. It is * called when the VIDIOC_STREAMON ioctl is called. The * actual streaming starts when @start_streaming is called. * Optional. * @start_streaming: called once to enter 'streaming' state; the driver may * receive buffers with @buf_queue callback * before @start_streaming is called; the driver gets the * number of already queued buffers in count parameter; * driver can return an error if hardware fails, in that * case all buffers that have been already given by * the @buf_queue callback are to be returned by the driver * by calling vb2_buffer_done() with %VB2_BUF_STATE_QUEUED. * If you need a minimum number of buffers before you can * start streaming, then set * &vb2_queue->min_queued_buffers. If that is non-zero * then @start_streaming won't be called until at least * that many buffers have been queued up by userspace. 
* @stop_streaming: called when 'streaming' state must be disabled; driver * should stop any DMA transactions or wait until they * finish and give back all buffers it got from &buf_queue * callback by calling vb2_buffer_done() with either * %VB2_BUF_STATE_DONE or %VB2_BUF_STATE_ERROR; may use * vb2_wait_for_all_buffers() function * @unprepare_streaming:called as counterpart to @prepare_streaming; any claimed * streaming resources can be released here. It is * called when the VIDIOC_STREAMOFF ioctls is called or * when the streaming filehandle is closed. Optional. * @buf_queue: passes buffer vb to the driver; driver may start * hardware operation on this buffer; driver should give * the buffer back by calling vb2_buffer_done() function; * it is always called after calling VIDIOC_STREAMON() * ioctl; might be called before @start_streaming callback * if user pre-queued buffers before calling * VIDIOC_STREAMON(). * @buf_request_complete: a buffer that was never queued to the driver but is * associated with a queued request was canceled. * The driver will have to mark associated objects in the * request as completed; required if requests are * supported. */ struct vb2_ops { int (*queue_setup)(struct vb2_queue *q, unsigned int *num_buffers, unsigned int *num_planes, unsigned int sizes[], struct device *alloc_devs[]); void (*wait_prepare)(struct vb2_queue *q); void (*wait_finish)(struct vb2_queue *q); int (*buf_out_validate)(struct vb2_buffer *vb); int (*buf_init)(struct vb2_buffer *vb); int (*buf_prepare)(struct vb2_buffer *vb); void (*buf_finish)(struct vb2_buffer *vb); void (*buf_cleanup)(struct vb2_buffer *vb); int (*prepare_streaming)(struct vb2_queue *q); int (*start_streaming)(struct vb2_queue *q, unsigned int count); void (*stop_streaming)(struct vb2_queue *q); void (*unprepare_streaming)(struct vb2_queue *q); void (*buf_queue)(struct vb2_buffer *vb); void (*buf_request_complete)(struct vb2_buffer *vb); }; /** * struct vb2_buf_ops - driver-specific callbacks. 
* * @verify_planes_array: Verify that a given user space structure contains * enough planes for the buffer. This is called * for each dequeued buffer. * @init_buffer: given a &vb2_buffer initialize the extra data after * struct vb2_buffer. * For V4L2 this is a &struct vb2_v4l2_buffer. * @fill_user_buffer: given a &vb2_buffer fill in the userspace structure. * For V4L2 this is a &struct v4l2_buffer. * @fill_vb2_buffer: given a userspace structure, fill in the &vb2_buffer. * If the userspace structure is invalid, then this op * will return an error. * @copy_timestamp: copy the timestamp from a userspace structure to * the &struct vb2_buffer. */ struct vb2_buf_ops { int (*verify_planes_array)(struct vb2_buffer *vb, const void *pb); void (*init_buffer)(struct vb2_buffer *vb); void (*fill_user_buffer)(struct vb2_buffer *vb, void *pb); int (*fill_vb2_buffer)(struct vb2_buffer *vb, struct vb2_plane *planes); void (*copy_timestamp)(struct vb2_buffer *vb, const void *pb); }; /** * struct vb2_queue - a videobuf2 queue. * * @type: private buffer type whose content is defined by the vb2-core * caller. For example, for V4L2, it should match * the types defined on &enum v4l2_buf_type. * @io_modes: supported io methods (see &enum vb2_io_modes). * @dev: device to use for the default allocation context if the driver * doesn't fill in the @alloc_devs array. * @dma_attrs: DMA attributes to use for the DMA. * @bidirectional: when this flag is set the DMA direction for the buffers of * this queue will be overridden with %DMA_BIDIRECTIONAL direction. * This is useful in cases where the hardware (firmware) writes to * a buffer which is mapped as read (%DMA_TO_DEVICE), or reads from * buffer which is mapped for write (%DMA_FROM_DEVICE) in order * to satisfy some internal hardware restrictions or adds a padding * needed by the processing algorithm. 
In case the DMA mapping is * not bidirectional but the hardware (firmware) tries to access * the buffer (in the opposite direction), this could lead to * IOMMU protection faults. * @fileio_read_once: report EOF after reading the first buffer * @fileio_write_immediately: queue buffer after each write() call * @allow_zero_bytesused: allow bytesused == 0 to be passed to the driver * @quirk_poll_must_check_waiting_for_buffers: Return %EPOLLERR at poll when QBUF * has not been called. This is a vb1 idiom that has been adopted * also by vb2. * @supports_requests: this queue supports the Request API. * @requires_requests: this queue requires the Request API. If this is set to 1, * then supports_requests must be set to 1 as well. * @uses_qbuf: qbuf was used directly for this queue. Set to 1 the first * time this is called. Set to 0 when the queue is canceled. * If this is 1, then you cannot queue buffers from a request. * @uses_requests: requests are used for this queue. Set to 1 the first time * a request is queued. Set to 0 when the queue is canceled. * If this is 1, then you cannot queue buffers directly. * @allow_cache_hints: when set user-space can pass cache management hints in * order to skip cache flush/invalidation on ->prepare() and/or * ->finish(). * @non_coherent_mem: when set queue will attempt to allocate buffers using * non-coherent memory. * @lock: pointer to a mutex that protects the &struct vb2_queue. The * driver can set this to a mutex to let the v4l2 core serialize * the queuing ioctls. If the driver wants to handle locking * itself, then this should be set to NULL. This lock is not used * by the videobuf2 core API. * @owner: The filehandle that 'owns' the buffers, i.e. the filehandle * that called reqbufs, create_buffers or started fileio. * This field is not used by the videobuf2 core API, but it allows * drivers to easily associate an owner filehandle with the queue.
* @ops: driver-specific callbacks * @mem_ops: memory allocator specific callbacks * @buf_ops: callbacks to deliver buffer information * between user-space and kernel-space. * @drv_priv: driver private data. * @subsystem_flags: Flags specific to the subsystem (V4L2/DVB/etc.). Not used * by the vb2 core. * @buf_struct_size: size of the driver-specific buffer structure; * "0" indicates the driver doesn't want to use a custom buffer * structure type. In that case a subsystem-specific struct * will be used (in the case of V4L2 that is * ``sizeof(struct vb2_v4l2_buffer)``). The first field of the * driver-specific buffer structure must be the subsystem-specific * struct (vb2_v4l2_buffer in the case of V4L2). * @timestamp_flags: Timestamp flags; ``V4L2_BUF_FLAG_TIMESTAMP_*`` and * ``V4L2_BUF_FLAG_TSTAMP_SRC_*`` * @gfp_flags: additional gfp flags used when allocating the buffers. * Typically this is 0, but it may be e.g. %GFP_DMA or %__GFP_DMA32 * to force the buffer allocation to a specific memory zone. * @min_queued_buffers: the minimum number of queued buffers needed before * @start_streaming can be called. Used when a DMA engine * cannot be started unless at least this number of buffers * have been queued into the driver. * VIDIOC_REQBUFS will ensure at least @min_queued_buffers + 1 * buffers will be allocated. Note that VIDIOC_CREATE_BUFS will not * modify the requested buffer count. * @min_reqbufs_allocation: the minimum number of buffers to be allocated when * calling VIDIOC_REQBUFS. Note that VIDIOC_CREATE_BUFS will *not* * modify the requested buffer count and does not use this field. * Drivers can set this if there has to be a certain number of * buffers available for the hardware to work effectively. * This allows calling VIDIOC_REQBUFS with a buffer count of 1 and * it will be automatically adjusted to a workable buffer count. * If set, then @min_reqbufs_allocation must be larger than * @min_queued_buffers + 1.
* If this field is > 3, then it is highly recommended that the * driver implements the V4L2_CID_MIN_BUFFERS_FOR_CAPTURE/OUTPUT * control. * @alloc_devs: &struct device memory type/allocator-specific per-plane device */ /* * Private elements (won't appear in the uAPI book): * @mmap_lock: private mutex used when buffers are allocated/freed/mmapped * @memory: current memory type used * @dma_dir: DMA mapping direction. * @bufs: videobuf2 buffer structures. If it is non-NULL then * bufs_bitmap is also non-NULL. * @bufs_bitmap: bitmap tracking whether each bufs[] entry is used * @max_num_buffers: upper limit of number of allocated/used buffers. * If set to 0 v4l2 core will change it to VB2_MAX_FRAME * for backward compatibility. * @queued_list: list of buffers currently queued from userspace * @queued_count: number of buffers queued and ready for streaming. * @owned_by_drv_count: number of buffers owned by the driver * @done_list: list of buffers ready to be dequeued to userspace * @done_lock: lock to protect done_list list * @done_wq: waitqueue for processes waiting for buffers ready to be dequeued * @streaming: current streaming state * @start_streaming_called: @start_streaming was called successfully and we * started streaming. * @error: a fatal error occurred on the queue * @waiting_for_buffers: used in poll() to check if vb2 is still waiting for * buffers. Only set for capture queues if qbuf has not yet been * called since poll() needs to return %EPOLLERR in that situation. * @waiting_in_dqbuf: set by the core for the duration of a blocking DQBUF, when * it has to wait for a buffer to become available with vb2_queue->lock * released. Used to prevent destroying the queue by other threads. * @is_multiplanar: set if buffer type is multiplanar * @is_output: set if buffer type is output * @is_busy: set if at least one buffer has been allocated at some time.
* @copy_timestamp: set if vb2-core should set timestamps * @last_buffer_dequeued: used in poll() and DQBUF to immediately return if the * last decoded buffer was already dequeued. Set for capture queues * when a buffer with the %V4L2_BUF_FLAG_LAST is dequeued. * @fileio: file io emulator internal data, used only if emulator is active * @threadio: thread io internal data, used only if thread is active * @name: queue name, used for logging purpose. Initialized automatically * if left empty by drivers. */ struct vb2_queue { unsigned int type; unsigned int io_modes; struct device *dev; unsigned long dma_attrs; unsigned int bidirectional:1; unsigned int fileio_read_once:1; unsigned int fileio_write_immediately:1; unsigned int allow_zero_bytesused:1; unsigned int quirk_poll_must_check_waiting_for_buffers:1; unsigned int supports_requests:1; unsigned int requires_requests:1; unsigned int uses_qbuf:1; unsigned int uses_requests:1; unsigned int allow_cache_hints:1; unsigned int non_coherent_mem:1; struct mutex *lock; void *owner; const struct vb2_ops *ops; const struct vb2_mem_ops *mem_ops; const struct vb2_buf_ops *buf_ops; void *drv_priv; u32 subsystem_flags; unsigned int buf_struct_size; u32 timestamp_flags; gfp_t gfp_flags; u32 min_queued_buffers; u32 min_reqbufs_allocation; struct device *alloc_devs[VB2_MAX_PLANES]; /* private: internal use only */ struct mutex mmap_lock; unsigned int memory; enum dma_data_direction dma_dir; struct vb2_buffer **bufs; unsigned long *bufs_bitmap; unsigned int max_num_buffers; struct list_head queued_list; unsigned int queued_count; atomic_t owned_by_drv_count; struct list_head done_list; spinlock_t done_lock; wait_queue_head_t done_wq; unsigned int streaming:1; unsigned int start_streaming_called:1; unsigned int error:1; unsigned int waiting_for_buffers:1; unsigned int waiting_in_dqbuf:1; unsigned int is_multiplanar:1; unsigned int is_output:1; unsigned int is_busy:1; unsigned int copy_timestamp:1; unsigned int last_buffer_dequeued:1; 
struct vb2_fileio_data *fileio; struct vb2_threadio_data *threadio; char name[32]; #ifdef CONFIG_VIDEO_ADV_DEBUG /* * Counters for how often these queue-related ops are * called. Used to check for unbalanced ops. */ u32 cnt_queue_setup; u32 cnt_wait_prepare; u32 cnt_wait_finish; u32 cnt_prepare_streaming; u32 cnt_start_streaming; u32 cnt_stop_streaming; u32 cnt_unprepare_streaming; #endif }; /** * vb2_queue_allows_cache_hints() - Return true if the queue allows cache * and memory consistency hints. * * @q: pointer to &struct vb2_queue with videobuf2 queue */ static inline bool vb2_queue_allows_cache_hints(struct vb2_queue *q) { return q->allow_cache_hints && q->memory == VB2_MEMORY_MMAP; } /** * vb2_plane_vaddr() - Return a kernel virtual address of a given plane. * @vb: pointer to &struct vb2_buffer to which the plane in * question belongs to. * @plane_no: plane number for which the address is to be returned. * * This function returns a kernel virtual address of a given plane if * such a mapping exist, NULL otherwise. */ void *vb2_plane_vaddr(struct vb2_buffer *vb, unsigned int plane_no); /** * vb2_plane_cookie() - Return allocator specific cookie for the given plane. * @vb: pointer to &struct vb2_buffer to which the plane in * question belongs to. * @plane_no: plane number for which the cookie is to be returned. * * This function returns an allocator specific cookie for a given plane if * available, NULL otherwise. The allocator should provide some simple static * inline function, which would convert this cookie to the allocator specific * type that can be used directly by the driver to access the buffer. This can * be for example physical address, pointer to scatter list or IOMMU mapping. */ void *vb2_plane_cookie(struct vb2_buffer *vb, unsigned int plane_no); /** * vb2_buffer_done() - inform videobuf2 that an operation on a buffer * is finished. * @vb: pointer to &struct vb2_buffer to be used. * @state: state of the buffer, as defined by &enum vb2_buffer_state. 
* Either %VB2_BUF_STATE_DONE if the operation finished * successfully, %VB2_BUF_STATE_ERROR if the operation finished * with an error or %VB2_BUF_STATE_QUEUED. * * This function should be called by the driver after a hardware operation on * a buffer is finished and the buffer may be returned to userspace. The driver * cannot use this buffer anymore until it is queued back to it by videobuf * by the means of &vb2_ops->buf_queue callback. Only buffers previously queued * to the driver by &vb2_ops->buf_queue can be passed to this function. * * While streaming a buffer can only be returned in state DONE or ERROR. * The &vb2_ops->start_streaming op can also return them in case the DMA engine * cannot be started for some reason. In that case the buffers should be * returned with state QUEUED to put them back into the queue. */ void vb2_buffer_done(struct vb2_buffer *vb, enum vb2_buffer_state state); /** * vb2_discard_done() - discard all buffers marked as DONE. * @q: pointer to &struct vb2_queue with videobuf2 queue. * * This function is intended to be used with suspend/resume operations. It * discards all 'done' buffers as they would be too old to be requested after * resume. * * Drivers must stop the hardware and synchronize with interrupt handlers and/or * delayed works before calling this function to make sure no buffer will be * touched by the driver and/or hardware. */ void vb2_discard_done(struct vb2_queue *q); /** * vb2_wait_for_all_buffers() - wait until all buffers are given back to vb2. * @q: pointer to &struct vb2_queue with videobuf2 queue. * * This function will wait until all buffers that have been given to the driver * by &vb2_ops->buf_queue are given back to vb2 with vb2_buffer_done(). It * doesn't call &vb2_ops->wait_prepare/&vb2_ops->wait_finish pair. * It is intended to be called with all locks taken, for example from * &vb2_ops->stop_streaming callback. 
*/ int vb2_wait_for_all_buffers(struct vb2_queue *q); /** * vb2_core_querybuf() - query video buffer information. * @q: pointer to &struct vb2_queue with videobuf2 queue. * @vb: pointer to struct &vb2_buffer. * @pb: buffer struct passed from userspace. * * Videobuf2 core helper to implement VIDIOC_QUERYBUF() operation. It is called * internally by VB2 by an API-specific handler, like ``videobuf2-v4l2.h``. * * The passed buffer should have been verified. * * This function fills the relevant information for the userspace. * * This function does not return a value. */ void vb2_core_querybuf(struct vb2_queue *q, struct vb2_buffer *vb, void *pb); /** * vb2_core_reqbufs() - Set up the queue and allocate its buffers. * @q: pointer to &struct vb2_queue with videobuf2 queue. * @memory: memory type, as defined by &enum vb2_memory. * @flags: auxiliary queue/buffer management flags. Currently, the only * used flag is %V4L2_MEMORY_FLAG_NON_COHERENT. * @count: requested buffer count. * * Videobuf2 core helper to implement VIDIOC_REQBUFS() operation. It is called * internally by VB2 by an API-specific handler, like ``videobuf2-v4l2.h``. * * This function: * * #) verifies streaming parameters passed from the userspace; * #) sets up the queue; * #) negotiates number of buffers and planes per buffer with the driver * to be used during streaming; * #) allocates internal buffer structures (&struct vb2_buffer), according to * the agreed parameters; * #) for MMAP memory type, allocates actual video memory, using the * memory handling/allocation routines provided during queue initialization. * * If the value pointed to by @count is 0, all the memory will be freed instead. * * If the queue has been allocated previously by a previous vb2_core_reqbufs() * call and the queue is not busy, memory will be reallocated. * * Return: returns zero on success; an error code otherwise.
*/ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory, unsigned int flags, unsigned int *count); /** * vb2_core_create_bufs() - Allocate buffers and any required auxiliary structs * @q: pointer to &struct vb2_queue with videobuf2 queue. * @memory: memory type, as defined by &enum vb2_memory. * @flags: auxiliary queue/buffer management flags. * @count: requested buffer count. * @requested_planes: number of planes requested. * @requested_sizes: array with the size of the planes. * @first_index: index of the first created buffer, all allocated buffers have * indices in the range [first_index..first_index+count-1] * * Videobuf2 core helper to implement VIDIOC_CREATE_BUFS() operation. It is * called internally by VB2 by an API-specific handler, like * ``videobuf2-v4l2.h``. * * This function: * * #) verifies parameter sanity; * #) calls the &vb2_ops->queue_setup queue operation; * #) performs any necessary memory allocations. * * Return: returns zero on success; an error code otherwise. */ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory, unsigned int flags, unsigned int *count, unsigned int requested_planes, const unsigned int requested_sizes[], unsigned int *first_index); /** * vb2_core_prepare_buf() - Pass ownership of a buffer from userspace * to the kernel. * @q: pointer to &struct vb2_queue with videobuf2 queue. * @vb: pointer to struct &vb2_buffer. * @pb: buffer structure passed from userspace to * &v4l2_ioctl_ops->vidioc_prepare_buf handler in driver. * * Videobuf2 core helper to implement VIDIOC_PREPARE_BUF() operation. It is * called internally by VB2 by an API-specific handler, like * ``videobuf2-v4l2.h``. * * The passed buffer should have been verified. * * This function calls vb2_ops->buf_prepare callback in the driver * (if provided), in which driver-specific buffer initialization can * be performed. * * Return: returns zero on success; an error code otherwise.
*/ int vb2_core_prepare_buf(struct vb2_queue *q, struct vb2_buffer *vb, void *pb); /** * vb2_core_remove_bufs() - Remove a range of buffers from the queue * @q: pointer to &struct vb2_queue with videobuf2 queue. * @start: first index of the range of buffers to remove. * @count: number of buffers to remove. * * Return: returns zero on success; an error code otherwise. */ int vb2_core_remove_bufs(struct vb2_queue *q, unsigned int start, unsigned int count); /** * vb2_core_qbuf() - Queue a buffer from userspace * * @q: pointer to &struct vb2_queue with videobuf2 queue. * @vb: pointer to struct &vb2_buffer. * @pb: buffer structure passed from userspace to * v4l2_ioctl_ops->vidioc_qbuf handler in driver * @req: pointer to &struct media_request, may be NULL. * * Videobuf2 core helper to implement VIDIOC_QBUF() operation. It is called * internally by VB2 by an API-specific handler, like ``videobuf2-v4l2.h``. * * This function: * * #) If @req is non-NULL, then the buffer will be bound to this * media request and it returns. The buffer will be prepared and * queued to the driver (i.e. the next two steps) when the request * itself is queued. * #) if necessary, calls &vb2_ops->buf_prepare callback in the driver * (if provided), in which driver-specific buffer initialization can * be performed; * #) if streaming is on, queues the buffer in driver by the means of * &vb2_ops->buf_queue callback for processing. * * Return: returns zero on success; an error code otherwise. */ int vb2_core_qbuf(struct vb2_queue *q, struct vb2_buffer *vb, void *pb, struct media_request *req); /** * vb2_core_dqbuf() - Dequeue a buffer to the userspace * @q: pointer to &struct vb2_queue with videobuf2 queue * @pindex: pointer to the buffer index. May be NULL * @pb: buffer structure passed from userspace to * v4l2_ioctl_ops->vidioc_dqbuf handler in driver. * @nonblocking: if true, this call will not sleep waiting for a buffer if no * buffers ready for dequeuing are present.
Normally the driver * would be passing (file->f_flags & O_NONBLOCK) here. * * Videobuf2 core helper to implement VIDIOC_DQBUF() operation. It is called * internally by VB2 by an API-specific handler, like ``videobuf2-v4l2.h``. * * This function: * * #) calls buf_finish callback in the driver (if provided), in which * driver can perform any additional operations that may be required before * returning the buffer to userspace, such as cache sync, * #) the buffer struct members are filled with relevant information for * the userspace. * * Return: returns zero on success; an error code otherwise. */ int vb2_core_dqbuf(struct vb2_queue *q, unsigned int *pindex, void *pb, bool nonblocking); /** * vb2_core_streamon() - Implements VB2 stream ON logic * * @q: pointer to &struct vb2_queue with videobuf2 queue * @type: type of the queue to be started. * For V4L2, this is defined by &enum v4l2_buf_type type. * * Videobuf2 core helper to implement VIDIOC_STREAMON() operation. It is called * internally by VB2 by an API-specific handler, like ``videobuf2-v4l2.h``. * * Return: returns zero on success; an error code otherwise. */ int vb2_core_streamon(struct vb2_queue *q, unsigned int type); /** * vb2_core_streamoff() - Implements VB2 stream OFF logic * * @q: pointer to &struct vb2_queue with videobuf2 queue * @type: type of the queue to be stopped. * For V4L2, this is defined by &enum v4l2_buf_type type. * * Videobuf2 core helper to implement VIDIOC_STREAMOFF() operation. It is * called internally by VB2 by an API-specific handler, like * ``videobuf2-v4l2.h``. * * Return: returns zero on success; an error code otherwise. */ int vb2_core_streamoff(struct vb2_queue *q, unsigned int type); /** * vb2_core_expbuf() - Export a buffer as a file descriptor. * @q: pointer to &struct vb2_queue with videobuf2 queue. * @fd: pointer to the file descriptor associated with DMABUF * (set by driver). * @type: buffer type. * @vb: pointer to struct &vb2_buffer.
* @plane: index of the plane to be exported, 0 for single plane queues * @flags: file flags for newly created file, as defined at * include/uapi/asm-generic/fcntl.h. * Currently, only the %O_CLOEXEC flag * is supported, refer to manual of open syscall for more details. * * * Videobuf2 core helper to implement VIDIOC_EXPBUF() operation. It is called * internally by VB2 by an API-specific handler, like ``videobuf2-v4l2.h``. * * Return: returns zero on success; an error code otherwise. */ int vb2_core_expbuf(struct vb2_queue *q, int *fd, unsigned int type, struct vb2_buffer *vb, unsigned int plane, unsigned int flags); /** * vb2_core_queue_init() - initialize a videobuf2 queue * @q: pointer to &struct vb2_queue with videobuf2 queue. * This structure should be allocated in driver * * The &vb2_queue structure should be allocated by the driver. The driver is * responsible for clearing its content and setting initial values for some * required entries before calling this function. * * .. note:: * * The following fields at @q should be set before calling this function: * &vb2_queue->ops, &vb2_queue->mem_ops, &vb2_queue->type. */ int vb2_core_queue_init(struct vb2_queue *q); /** * vb2_core_queue_release() - stop streaming, release the queue and free memory * @q: pointer to &struct vb2_queue with videobuf2 queue. * * This function stops streaming and performs necessary clean ups, including * freeing video buffer memory. The driver is responsible for freeing * the &struct vb2_queue itself. */ void vb2_core_queue_release(struct vb2_queue *q); /** * vb2_queue_error() - signal a fatal error on the queue * @q: pointer to &struct vb2_queue with videobuf2 queue. * * Flag that a fatal unrecoverable error has occurred and wake up all processes * waiting on the queue. Polling will now set %EPOLLERR and queuing and dequeuing * buffers will return %-EIO. * * The error flag will be cleared when canceling the queue, either from * vb2_streamoff() or vb2_queue_release().
Drivers should thus not call this * function before starting the stream, otherwise the error flag will remain set * until the queue is released when closing the device node. */ void vb2_queue_error(struct vb2_queue *q); /** * vb2_mmap() - map video buffers into application address space. * @q: pointer to &struct vb2_queue with videobuf2 queue. * @vma: pointer to &struct vm_area_struct with the vma passed * to the mmap file operation handler in the driver. * * Should be called from mmap file operation handler of a driver. * This function maps one plane of one of the available video buffers to * userspace. To map whole video memory allocated on reqbufs, this function * has to be called once per each plane per each buffer previously allocated. * * When the userspace application calls mmap, it passes to it an offset returned * to it earlier by the means of &v4l2_ioctl_ops->vidioc_querybuf handler. * That offset acts as a "cookie", which is then used to identify the plane * to be mapped. * * This function finds a plane with a matching offset and a mapping is performed * by the means of a provided memory operation. * * The return values from this function are intended to be directly returned * from the mmap handler in driver. */ int vb2_mmap(struct vb2_queue *q, struct vm_area_struct *vma); #ifndef CONFIG_MMU /** * vb2_get_unmapped_area - map video buffers into application address space. * @q: pointer to &struct vb2_queue with videobuf2 queue. * @addr: memory address. * @len: buffer size. * @pgoff: page offset. * @flags: memory flags. * * This function is used in noMMU platforms to propose address mapping * for a given buffer. It's intended to be used as a handler for the * &file_operations->get_unmapped_area operation. * * The mmap() syscall routines will call this * to get a proposed address for the mapping, when ``!CONFIG_MMU``.
*/ unsigned long vb2_get_unmapped_area(struct vb2_queue *q, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #endif /** * vb2_core_poll() - implements poll() syscall logic. * @q: pointer to &struct vb2_queue with videobuf2 queue. * @file: &struct file argument passed to the poll * file operation handler. * @wait: &poll_table wait argument passed to the poll * file operation handler. * * This function implements poll file operation handler for a driver. * For CAPTURE queues, if a buffer is ready to be dequeued, the userspace will * be informed that the file descriptor of a video device is available for * reading. * For OUTPUT queues, if a buffer is ready to be dequeued, the file descriptor * will be reported as available for writing. * * The return values from this function are intended to be directly returned * from poll handler in driver. */ __poll_t vb2_core_poll(struct vb2_queue *q, struct file *file, poll_table *wait); /** * vb2_read() - implements read() syscall logic. * @q: pointer to &struct vb2_queue with videobuf2 queue. * @data: pointer to the target userspace buffer * @count: number of bytes to read * @ppos: file handle position tracking pointer * @nonblock: mode selector; presumably non-zero requests non-blocking * operation, as the name suggests (TODO: confirm against the * implementation) */ size_t vb2_read(struct vb2_queue *q, char __user *data, size_t count, loff_t *ppos, int nonblock); /** * vb2_write() - implements write() syscall logic. * @q: pointer to &struct vb2_queue with videobuf2 queue. * @data: pointer to the userspace buffer holding the data to write * @count: number of bytes to write * @ppos: file handle position tracking pointer * @nonblock: mode selector; presumably non-zero requests non-blocking * operation, as the name suggests (TODO: confirm against the * implementation) */ size_t vb2_write(struct vb2_queue *q, const char __user *data, size_t count, loff_t *ppos, int nonblock); /** * typedef vb2_thread_fnc - callback function for use with vb2_thread. * * @vb: pointer to struct &vb2_buffer. * @priv: pointer to a private data.
* * This is called whenever a buffer is dequeued in the thread. */ typedef int (*vb2_thread_fnc)(struct vb2_buffer *vb, void *priv); /** * vb2_thread_start() - start a thread for the given queue. * @q: pointer to &struct vb2_queue with videobuf2 queue. * @fnc: &vb2_thread_fnc callback function. * @priv: priv pointer passed to the callback function. * @thread_name:the name of the thread. This will be prefixed with "vb2-". * * This starts a thread that will queue and dequeue until an error occurs * or vb2_thread_stop() is called. * * .. attention:: * * This function should not be used for anything else but the videobuf2-dvb * support. If you think you have another good use-case for this, then please * contact the linux-media mailing list first. */ int vb2_thread_start(struct vb2_queue *q, vb2_thread_fnc fnc, void *priv, const char *thread_name); /** * vb2_thread_stop() - stop the thread for the given queue. * @q: pointer to &struct vb2_queue with videobuf2 queue. */ int vb2_thread_stop(struct vb2_queue *q); /** * vb2_is_streaming() - return streaming status of the queue. * @q: pointer to &struct vb2_queue with videobuf2 queue. */ static inline bool vb2_is_streaming(struct vb2_queue *q) { return q->streaming; } /** * vb2_fileio_is_active() - return true if fileio is active. * @q: pointer to &struct vb2_queue with videobuf2 queue. * * This returns true if read() or write() is used to stream the data * as opposed to stream I/O. This is almost never an important distinction, * except in rare cases. One such case is that using read() or write() to * stream a format using %V4L2_FIELD_ALTERNATE is not allowed since there * is no way you can pass the field information of each buffer to/from * userspace. A driver that supports this field format should check for * this in the &vb2_ops->queue_setup op and reject it if this function returns * true. 
*/ static inline bool vb2_fileio_is_active(struct vb2_queue *q) { return q->fileio; } /** * vb2_get_num_buffers() - get the number of buffer in a queue * @q: pointer to &struct vb2_queue with videobuf2 queue. */ static inline unsigned int vb2_get_num_buffers(struct vb2_queue *q) { if (q->bufs_bitmap) return bitmap_weight(q->bufs_bitmap, q->max_num_buffers); return 0; } /** * vb2_is_busy() - return busy status of the queue. * @q: pointer to &struct vb2_queue with videobuf2 queue. * * This function checks if queue has any buffers allocated. */ static inline bool vb2_is_busy(struct vb2_queue *q) { return !!q->is_busy; } /** * vb2_get_drv_priv() - return driver private data associated with the queue. * @q: pointer to &struct vb2_queue with videobuf2 queue. */ static inline void *vb2_get_drv_priv(struct vb2_queue *q) { return q->drv_priv; } /** * vb2_set_plane_payload() - set bytesused for the plane @plane_no. * @vb: pointer to &struct vb2_buffer to which the plane in * question belongs to. * @plane_no: plane number for which payload should be set. * @size: payload in bytes. */ static inline void vb2_set_plane_payload(struct vb2_buffer *vb, unsigned int plane_no, unsigned long size) { /* * size must never be larger than the buffer length, so * warn and clamp to the buffer length if that's the case. */ if (plane_no < vb->num_planes) { if (WARN_ON_ONCE(size > vb->planes[plane_no].length)) size = vb->planes[plane_no].length; vb->planes[plane_no].bytesused = size; } } /** * vb2_get_plane_payload() - get bytesused for the plane plane_no * @vb: pointer to &struct vb2_buffer to which the plane in * question belongs to. * @plane_no: plane number for which payload should be set. */ static inline unsigned long vb2_get_plane_payload(struct vb2_buffer *vb, unsigned int plane_no) { if (plane_no < vb->num_planes) return vb->planes[plane_no].bytesused; return 0; } /** * vb2_plane_size() - return plane size in bytes. 
* @vb: pointer to &struct vb2_buffer to which the plane in * question belongs to. * @plane_no: plane number for which size should be returned. */ static inline unsigned long vb2_plane_size(struct vb2_buffer *vb, unsigned int plane_no) { if (plane_no < vb->num_planes) return vb->planes[plane_no].length; return 0; } /** * vb2_start_streaming_called() - return streaming status of driver. * @q: pointer to &struct vb2_queue with videobuf2 queue. */ static inline bool vb2_start_streaming_called(struct vb2_queue *q) { return q->start_streaming_called; } /** * vb2_clear_last_buffer_dequeued() - clear last buffer dequeued flag of queue. * @q: pointer to &struct vb2_queue with videobuf2 queue. */ static inline void vb2_clear_last_buffer_dequeued(struct vb2_queue *q) { q->last_buffer_dequeued = false; } /** * vb2_get_buffer() - get a buffer from a queue * @q: pointer to &struct vb2_queue with videobuf2 queue. * @index: buffer index * * This function obtains a buffer from a queue, by its index. * Keep in mind that there is no refcounting involved in this * operation, so the buffer lifetime should be taken into * consideration. */ static inline struct vb2_buffer *vb2_get_buffer(struct vb2_queue *q, unsigned int index) { if (!q->bufs) return NULL; if (index >= q->max_num_buffers) return NULL; if (test_bit(index, q->bufs_bitmap)) return q->bufs[index]; return NULL; } /* * The following functions are not part of the vb2 core API, but are useful * functions for videobuf2-*. */ /** * vb2_buffer_in_use() - return true if the buffer is in use and * the queue cannot be freed (by the means of VIDIOC_REQBUFS(0)) call. * * @vb: buffer for which plane size should be returned. * @q: pointer to &struct vb2_queue with videobuf2 queue. */ bool vb2_buffer_in_use(struct vb2_queue *q, struct vb2_buffer *vb); /** * vb2_verify_memory_type() - Check whether the memory type and buffer type * passed to a buffer operation are compatible with the queue. 
* * @q: pointer to &struct vb2_queue with videobuf2 queue. * @memory: memory model, as defined by enum &vb2_memory. * @type: private buffer type whose content is defined by the vb2-core * caller. For example, for V4L2, it should match * the types defined on enum &v4l2_buf_type. */ int vb2_verify_memory_type(struct vb2_queue *q, enum vb2_memory memory, unsigned int type); /** * vb2_request_object_is_buffer() - return true if the object is a buffer * * @obj: the request object. */ bool vb2_request_object_is_buffer(struct media_request_object *obj); /** * vb2_request_buffer_cnt() - return the number of buffers in the request * * @req: the request. */ unsigned int vb2_request_buffer_cnt(struct media_request *req); #endif /* _MEDIA_VIDEOBUF2_CORE_H */ |
| 136 1 129 129 92 1 91 133 128 128 128 3 328 132 284 283 52 52 21 33 21 8 9 224 224 133 133 57 57 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. 
*/ #include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "accessors.h" #include "dir-item.h" /* * insert a name into a directory, doing overflow properly if there is a hash * collision. data_size indicates how big the item inserted should be. On * success a struct btrfs_dir_item pointer is returned, otherwise it is * an ERR_PTR. * * The name is not copied into the dir item, you have to do that yourself. */ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 data_size, const char *name, int name_len) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *ptr; struct extent_buffer *leaf; ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (ret == -EEXIST) { struct btrfs_dir_item *di; di = btrfs_match_dir_item_name(fs_info, path, name, name_len); if (di) return ERR_PTR(-EEXIST); btrfs_extend_item(trans, path, data_size); } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); leaf = path->nodes[0]; ptr = btrfs_item_ptr(leaf, path->slots[0], char); ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0])); ptr += btrfs_item_size(leaf, path->slots[0]) - data_size; return (struct btrfs_dir_item *)ptr; } /* * xattrs work a lot like directories, this inserts an xattr item * into the tree */ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, const char *name, u16 name_len, const void *data, u16 data_len) { int ret = 0; struct btrfs_dir_item *dir_item; unsigned long name_ptr, data_ptr; struct btrfs_key key, location; struct btrfs_disk_key disk_key; struct extent_buffer *leaf; u32 data_size; if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) return -ENOSPC; key.objectid = objectid; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); data_size = sizeof(*dir_item) + 
name_len + data_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); if (IS_ERR(dir_item)) return PTR_ERR(dir_item); memset(&location, 0, sizeof(location)); leaf = path->nodes[0]; btrfs_cpu_key_to_disk(&disk_key, &location); btrfs_set_dir_item_key(leaf, dir_item, &disk_key); btrfs_set_dir_flags(leaf, dir_item, BTRFS_FT_XATTR); btrfs_set_dir_name_len(leaf, dir_item, name_len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); btrfs_set_dir_data_len(leaf, dir_item, data_len); name_ptr = (unsigned long)(dir_item + 1); data_ptr = (unsigned long)((char *)name_ptr + name_len); write_extent_buffer(leaf, name, name_ptr, name_len); write_extent_buffer(leaf, data, data_ptr, data_len); btrfs_mark_buffer_dirty(trans, path->nodes[0]); return ret; } /* * insert a directory item in the tree, doing all the magic for * both indexes. 'dir' indicates which objectid to insert it into, * 'location' is the key to stuff into the directory item, 'type' is the * type of the inode we're pointing to, and 'index' is the sequence number * to use for the second index (if one is created). 
* Will return 0 or -ENOMEM */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, const struct btrfs_key *location, u8 type, u64 index) { int ret = 0; int ret2 = 0; struct btrfs_root *root = dir->root; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *leaf; unsigned long name_ptr; struct btrfs_key key; struct btrfs_disk_key disk_key; u32 data_size; key.objectid = btrfs_ino(dir); key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name->name, name->len); path = btrfs_alloc_path(); if (!path) return -ENOMEM; btrfs_cpu_key_to_disk(&disk_key, location); data_size = sizeof(*dir_item) + name->len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name->name, name->len); if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); if (ret == -EEXIST) goto second_insert; goto out_free; } if (IS_ENCRYPTED(&dir->vfs_inode)) type |= BTRFS_FT_ENCRYPTED; leaf = path->nodes[0]; btrfs_set_dir_item_key(leaf, dir_item, &disk_key); btrfs_set_dir_flags(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); btrfs_set_dir_name_len(leaf, dir_item, name->len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name->name, name_ptr, name->len); btrfs_mark_buffer_dirty(trans, leaf); second_insert: /* FIXME, use some real flag for selecting the extra index */ if (root == root->fs_info->tree_root) { ret = 0; goto out_free; } btrfs_release_path(path); ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir, &disk_key, type, index); out_free: btrfs_free_path(path); if (ret) return ret; if (ret2) return ret2; return 0; } static struct btrfs_dir_item *btrfs_lookup_match_dir( struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, const char *name, int name_len, int mod) { const int ins_len = (mod < 0 ? 
-1 : 0); const int cow = (mod != 0); int ret; ret = btrfs_search_slot(trans, root, key, path, ins_len, cow); if (ret < 0) return ERR_PTR(ret); if (ret > 0) return ERR_PTR(-ENOENT); return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); } /* * Lookup for a directory item by name. * * @trans: The transaction handle to use. Can be NULL if @mod is 0. * @root: The root of the target tree. * @path: Path to use for the search. * @dir: The inode number (objectid) of the directory. * @name: The name associated to the directory entry we are looking for. * @name_len: The length of the name. * @mod: Used to indicate if the tree search is meant for a read only * lookup, for a modification lookup or for a deletion lookup, so * its value should be 0, 1 or -1, respectively. * * Returns: NULL if the dir item does not exists, an error pointer if an error * happened, or a pointer to a dir item if a dir item exists for the given name. */ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, const struct fscrypt_str *name, int mod) { struct btrfs_key key; struct btrfs_dir_item *di; key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name->name, name->len); di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, name->len, mod); if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; return di; } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const struct fscrypt_str *name) { int ret; struct btrfs_key key; struct btrfs_dir_item *di; int data_size; struct extent_buffer *leaf; int slot; struct btrfs_path *path; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name->name, name->len); di = btrfs_lookup_match_dir(NULL, root, path, &key, name->name, name->len, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); /* Nothing found, we're safe */ if (ret == -ENOENT) { 
ret = 0; goto out; } if (ret < 0) goto out; } /* we found an item, look for our name in the item */ if (di) { /* our exact name was found */ ret = -EEXIST; goto out; } /* See if there is room in the item to insert this name. */ data_size = sizeof(*di) + name->len; leaf = path->nodes[0]; slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { ret = -EOVERFLOW; } else { /* plenty of insertion room */ ret = 0; } out: btrfs_free_path(path); return ret; } /* * Lookup for a directory index item by name and index number. * * @trans: The transaction handle to use. Can be NULL if @mod is 0. * @root: The root of the target tree. * @path: Path to use for the search. * @dir: The inode number (objectid) of the directory. * @index: The index number. * @name: The name associated to the directory entry we are looking for. * @name_len: The length of the name. * @mod: Used to indicate if the tree search is meant for a read only * lookup, for a modification lookup or for a deletion lookup, so * its value should be 0, 1 or -1, respectively. * * Returns: NULL if the dir index item does not exists, an error pointer if an * error happened, or a pointer to a dir item if the dir index item exists and * matches the criteria (name and index number). 
*/ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, u64 index, const struct fscrypt_str *name, int mod) { struct btrfs_dir_item *di; struct btrfs_key key; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = index; di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, name->len, mod); if (di == ERR_PTR(-ENOENT)) return NULL; return di; } struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, const struct fscrypt_str *name) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; key.objectid = dirid; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; btrfs_for_each_slot(root, &key, &key, path, ret) { if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; di = btrfs_match_dir_item_name(root->fs_info, path, name->name, name->len); if (di) return di; } /* Adjust return code if the key was not found in the next leaf. */ if (ret > 0) ret = 0; return ERR_PTR(ret); } struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod) { struct btrfs_key key; struct btrfs_dir_item *di; key.objectid = dir; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; return di; } /* * helper function to look at the directory item pointed to by 'path' * this walks through all the entries in a dir item and finds one * for a specific name. 
*/ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, const struct btrfs_path *path, const char *name, int name_len) { struct btrfs_dir_item *dir_item; unsigned long name_ptr; u32 total_len; u32 cur = 0; u32 this_len; struct extent_buffer *leaf; leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); total_len = btrfs_item_size(leaf, path->slots[0]); while (cur < total_len) { this_len = sizeof(*dir_item) + btrfs_dir_name_len(leaf, dir_item) + btrfs_dir_data_len(leaf, dir_item); name_ptr = (unsigned long)(dir_item + 1); if (btrfs_dir_name_len(leaf, dir_item) == name_len && memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) return dir_item; cur += this_len; dir_item = (struct btrfs_dir_item *)((char *)dir_item + this_len); } return NULL; } /* * given a pointer into a directory item, delete it. This * handles items that have more than one entry in them. */ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_dir_item *di) { struct extent_buffer *leaf; u32 sub_item_len; u32 item_len; int ret = 0; leaf = path->nodes[0]; sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + btrfs_dir_data_len(leaf, di); item_len = btrfs_item_size(leaf, path->slots[0]); if (sub_item_len == item_len) { ret = btrfs_del_item(trans, root, path); } else { /* MARKER */ unsigned long ptr = (unsigned long)di; unsigned long start; start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); btrfs_truncate_item(trans, path, item_len - sub_item_len, 1); } return ret; } |
| 2863 2 1343 1 4442 3214 2742 1316 78 8040 579 3606 2926 8412 1691 8426 4439 4047 8013 999 8040 8041 8039 42 42 42 8407 5526 8006 11 5243 5137 5132 1 410 411 2823 4275 1 3390 3248 1314 14 4266 14 2368 3503 1426 1445 1323 356 356 355 1 245 149 355 149 1871 1663 1481 893 897 869 1741 4289 2325 2361 838 2807 2190 2189 2 2191 411 2186 1983 647 2185 5304 4285 4293 4298 1 3454 4291 3800 5313 2703 2697 1 2459 3531 3519 295 5163 4614 2002 5740 2399 8 5745 5 5748 3312 5159 5141 78 5299 2403 5743 5752 4615 52 62 1998 2033 67 2029 5172 2 1 3570 3559 3195 3181 2236 2863 3259 996 2162 2749 1317 1109 1006 1343 1545 10 2413 1544 11 11 11 11 11 11 11 11 11 6 4 4 4 2 69 68 24 1 69 23 23 23 5 21 2 2 2 281 52 234 211 52 28 28 27 27 1 28 28 26 4314 1050 217 1147 4325 2443 15 2918 2911 2001 2238 872 2055 2010 3709 775 424 1141 3714 2398 2534 2345 1916 1761 1139 2349 2344 16 2351 2348 717 2163 1491 13 1474 563 2586 4790 3636 1713 3563 3556 738 3498 49 49 48 6891 6900 6898 288 376 39 73 60 60 332 318 14 332 242 286 287 287 60 59 60 17 17 16 5 5 5 5 3261 3260 509 507 2 2 2 2 2333 2336 2338 4 2337 2048 457 25 449 445 448 448 25 25 450 449 25 4 4 4 4 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 
241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 
741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 
1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 
1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 
1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 | // SPDX-License-Identifier: GPL-2.0+ /* * XArray implementation * Copyright (c) 2017-2018 Microsoft Corporation * Copyright (c) 2018-2020 Oracle * Author: Matthew Wilcox <willy@infradead.org> */ #include <linux/bitmap.h> #include <linux/export.h> #include 
<linux/list.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include "radix-tree.h"

/*
 * Coding conventions in this file:
 *
 * @xa is used to refer to the entire xarray.
 * @xas is the 'xarray operation state'.  It may be either a pointer to
 * an xa_state, or an xa_state stored on the stack.  This is an unfortunate
 * ambiguity.
 * @index is the index of the entry being operated on
 * @mark is an xa_mark_t; a small number indicating one of the mark bits.
 * @node refers to an xa_node; usually the primary one being operated on by
 * this function.
 * @offset is the index into the slots array inside an xa_node.
 * @parent refers to the @xa_node closer to the head than @node.
 * @entry refers to something stored in a slot in the xarray
 */

/*
 * Return the low two bits of @xa->xa_flags.  The value is what
 * xas_lock_type() and xas_unlock_type() compare against XA_LOCK_IRQ and
 * XA_LOCK_BH.
 */
static inline unsigned int xa_lock_type(const struct xarray *xa)
{
	return (__force unsigned int)xa->xa_flags & 3;
}

/*
 * Take the array lock with the primitive matching @lock_type:
 * xas_lock_irq() for XA_LOCK_IRQ, xas_lock_bh() for XA_LOCK_BH and
 * xas_lock() for anything else.
 */
static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type)
{
	if (lock_type == XA_LOCK_IRQ)
		xas_lock_irq(xas);
	else if (lock_type == XA_LOCK_BH)
		xas_lock_bh(xas);
	else
		xas_lock(xas);
}

/* Release the array lock; the exact counterpart of xas_lock_type(). */
static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type)
{
	if (lock_type == XA_LOCK_IRQ)
		xas_unlock_irq(xas);
	else if (lock_type == XA_LOCK_BH)
		xas_unlock_bh(xas);
	else
		xas_unlock(xas);
}

/* True if XA_FLAGS_TRACK_FREE is set in the array flags. */
static inline bool xa_track_free(const struct xarray *xa)
{
	return xa->xa_flags & XA_FLAGS_TRACK_FREE;
}

/* True if XA_FLAGS_ZERO_BUSY is set in the array flags. */
static inline bool xa_zero_busy(const struct xarray *xa)
{
	return xa->xa_flags & XA_FLAGS_ZERO_BUSY;
}

/* Set the array-level flag for @mark; the store is skipped if already set. */
static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark)
{
	if (!(xa->xa_flags & XA_FLAGS_MARK(mark)))
		xa->xa_flags |= XA_FLAGS_MARK(mark);
}

/* Clear the array-level flag for @mark; the store is skipped if not set. */
static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark)
{
	if (xa->xa_flags & XA_FLAGS_MARK(mark))
		xa->xa_flags &= ~(XA_FLAGS_MARK(mark));
}

/* Return the bitmap of @node that records @mark, one bit per slot offset. */
static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark)
{
	return node->marks[(__force unsigned)mark];
}

/* True if slot @offset of @node has @mark set. */
static inline bool node_get_mark(struct xa_node *node,
		unsigned int offset, xa_mark_t mark)
{
	return test_bit(offset, node_marks(node, mark));
}

/* Sets the mark bit; returns true if the bit was already set beforehand */
static inline bool node_set_mark(struct xa_node *node, unsigned int offset,
				xa_mark_t mark)
{
	return __test_and_set_bit(offset, node_marks(node, mark));
}

/* Clears the mark bit; returns true if the bit was set beforehand */
static inline bool node_clear_mark(struct xa_node *node, unsigned int offset,
				xa_mark_t mark)
{
	return __test_and_clear_bit(offset, node_marks(node, mark));
}

/* True if any of the XA_CHUNK_SIZE slots of @node has @mark set. */
static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark)
{
	return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE);
}

/* Set @mark on all XA_CHUNK_SIZE slots of @node. */
static inline void node_mark_all(struct xa_node *node, xa_mark_t mark)
{
	bitmap_fill(node_marks(node, mark), XA_CHUNK_SIZE);
}

/* Advance @mark to the next mark number (it is assigned in place). */
#define mark_inc(mark) do { \
	mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \
} while (0)

/*
 * xas_squash_marks() - Merge all marks to the first entry
 * @xas: Array operation state.
 *
 * Set a mark on the first entry if any entry has it set.  Clear marks on
 * all sibling entries.
 */
static void xas_squash_marks(const struct xa_state *xas)
{
	unsigned int mark = 0;
	/* One past the last sibling slot covered by this operation. */
	unsigned int limit = xas->xa_offset + xas->xa_sibs + 1;

	/* No siblings, so there is nothing to merge. */
	if (!xas->xa_sibs)
		return;

	/* Visits every mark number from 0 up to and including XA_MARK_MAX. */
	do {
		unsigned long *marks = xas->xa_node->marks[mark];
		/* No sibling slot carries this mark: leave the bitmap alone. */
		if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit)
			continue;
		__set_bit(xas->xa_offset, marks);
		bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs);
	} while (mark++ != (__force unsigned)XA_MARK_MAX);
}

/* extracts the offset within this node from the index */
static unsigned int get_offset(unsigned long index, struct xa_node *node)
{
	return (index >> node->shift) & XA_CHUNK_MASK;
}

/* Recompute xas->xa_offset from xas->xa_index for the current node. */
static void xas_set_offset(struct xa_state *xas)
{
	xas->xa_offset = get_offset(xas->xa_index, xas->xa_node);
}

/* move the index either forwards (find) or backwards (sibling slot) */
static void xas_move_index(struct xa_state *xas, unsigned long offset)
{
	unsigned int shift = xas->xa_node->shift;
	/* Drop this node's offset bits and all lower bits from the index... */
	xas->xa_index &= ~XA_CHUNK_MASK << shift;
	/* ...then point it at the first index covered by slot @offset. */
	xas->xa_index += offset << shift;
}

/* Step to the next slot of the current node and update xa_index to match. */
static void xas_next_offset(struct xa_state *xas)
{
	xas->xa_offset++;
	xas_move_index(xas, xas->xa_offset);
}

/* Put @xas into the XAS_BOUNDS state; returns NULL for the caller to pass on. */
static void *set_bounds(struct xa_state *xas)
{
	xas->xa_node = XAS_BOUNDS;
	return NULL;
}

/*
 * Starts a walk.  If the @xas is already valid, we assume that it's on
 * the right path and just return where we've got to.  If we're in an
 * error state, return NULL.  If the index is outside the current scope
 * of the xarray, set @xas->xa_node to XAS_BOUNDS and return NULL.  Otherwise
 * set @xas->xa_node to NULL and return the current head of the array.
 */
static void *xas_start(struct xa_state *xas)
{
	void *entry;

	if (xas_valid(xas))
		return xas_reload(xas);
	if (xas_error(xas))
		return NULL;

	entry = xa_head(xas->xa);
	if (!xa_is_node(entry)) {
		/* A head that is not a node can only stand for index 0. */
		if (xas->xa_index)
			return set_bounds(xas);
	} else {
		/* The index needs more slots than the head node provides. */
		if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK)
			return set_bounds(xas);
	}

	xas->xa_node = NULL;
	return entry;
}

/*
 * Move @xas down into @node: record the node and the slot offset selected by
 * xas->xa_index, and return the entry stored in that slot.
 *
 * A sibling entry is followed to the slot it refers to, and xa_offset is set
 * to that slot.  If, in a node with a non-zero shift, the slot referred to
 * holds a node, XA_RETRY_ENTRY is returned in its place.
 */
static __always_inline void *xas_descend(struct xa_state *xas,
					struct xa_node *node)
{
	unsigned int offset = get_offset(xas->xa_index, node);
	void *entry = xa_entry(xas->xa, node, offset);

	xas->xa_node = node;
	while (xa_is_sibling(entry)) {
		offset = xa_to_sibling(entry);
		entry = xa_entry(xas->xa, node, offset);
		if (node->shift && xa_is_node(entry))
			entry = XA_RETRY_ENTRY;
	}

	xas->xa_offset = offset;
	return entry;
}

/**
 * xas_load() - Load an entry from the XArray (advanced).
 * @xas: XArray operation state.
 *
 * Usually walks the @xas to the appropriate state to load the entry
 * stored at xa_index.  However, it will do nothing and return %NULL if
 * @xas is in an error state.  xas_load() will never expand the tree.
 *
 * If the xa_state is set up to operate on a multi-index entry, xas_load()
 * may return %NULL or an internal entry, even if there are entries
 * present within the range specified by @xas.
 *
 * Context: Any context.  The caller should hold the xa_lock or the RCU lock.
 * Return: Usually an entry in the XArray, but see description for exceptions.
 */
void *xas_load(struct xa_state *xas)
{
	void *entry = xas_start(xas);

	while (xa_is_node(entry)) {
		struct xa_node *node = xa_to_node(entry);

		/* Stop above nodes smaller than the range @xas operates on. */
		if (xas->xa_shift > node->shift)
			break;
		entry = xas_descend(xas, node);
		/* A node with shift 0 is the bottom level; never descend further. */
		if (node->shift == 0)
			break;
	}
	return entry;
}
EXPORT_SYMBOL_GPL(xas_load);

/* Value stored in node->array by xa_node_free() once a node is queued for freeing */
#define XA_RCU_FREE	((struct xarray *)1)

/*
 * Queue @node for freeing: mark it with XA_RCU_FREE and hand it to
 * call_rcu() with radix_tree_node_rcu_free() as the callback.  The node's
 * private_list must be empty.
 */
static void xa_node_free(struct xa_node *node)
{
	XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
	node->array = XA_RCU_FREE;
	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}

/*
 * xas_destroy() - Free any resources allocated during the XArray operation.
 * @xas: XArray operation state.
 *
 * Most users will not need to call this function; it is called for you
 * by xas_nomem().
 */
void xas_destroy(struct xa_state *xas)
{
	struct xa_node *next, *node = xas->xa_alloc;

	/* The preallocated nodes form a list chained through ->parent. */
	while (node) {
		XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
		next = rcu_dereference_raw(node->parent);
		/* Called directly here, not deferred through call_rcu(). */
		radix_tree_node_rcu_free(&node->rcu_head);
		xas->xa_alloc = node = next;
	}
}

/**
 * xas_nomem() - Allocate memory if needed.
 * @xas: XArray operation state.
 * @gfp: Memory allocation flags.
 *
 * If we need to add new nodes to the XArray, we try to allocate memory
 * with GFP_NOWAIT while holding the lock, which will usually succeed.
 * If it fails, @xas is flagged as needing memory to continue.  The caller
 * should drop the lock and call xas_nomem().  If xas_nomem() succeeds,
 * the caller should retry the operation.
 *
 * Forward progress is guaranteed as one node is allocated here and
 * stored in the xa_state where it will be found by xas_alloc().  More
 * nodes will likely be found in the slab allocator, but we do not tie
 * them up here.
 *
 * Return: true if memory was needed, and was successfully allocated.
 */
bool xas_nomem(struct xa_state *xas, gfp_t gfp)
{
	/* Not an -ENOMEM failure: release any leftover preallocation. */
	if (xas->xa_node != XA_ERROR(-ENOMEM)) {
		xas_destroy(xas);
		return false;
	}
	if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
		gfp |= __GFP_ACCOUNT;
	xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
	if (!xas->xa_alloc)
		return false;
	/* ->parent links the preallocated nodes; this one ends the list. */
	xas->xa_alloc->parent = NULL;
	XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list));
	/* Replace the error state so that the caller can retry the operation. */
	xas->xa_node = XAS_RESTART;
	return true;
}
EXPORT_SYMBOL_GPL(xas_nomem);

/*
 * __xas_nomem() - Drop locks and allocate memory if needed.
 * @xas: XArray operation state.
 * @gfp: Memory allocation flags.
 *
 * Internal variant of xas_nomem().  The array lock is released around the
 * allocation only when @gfp allows blocking, and is held again on return.
 *
 * Return: true if memory was needed, and was successfully allocated.
 */
static bool __xas_nomem(struct xa_state *xas, gfp_t gfp)
	__must_hold(xas->xa->xa_lock)
{
	unsigned int lock_type = xa_lock_type(xas->xa);

	if (xas->xa_node != XA_ERROR(-ENOMEM)) {
		xas_destroy(xas);
		return false;
	}
	if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
		gfp |= __GFP_ACCOUNT;
	if (gfpflags_allow_blocking(gfp)) {
		/* Drop the lock for the duration of an allocation that may block. */
		xas_unlock_type(xas, lock_type);
		xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
		xas_lock_type(xas, lock_type);
	} else {
		xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
	}
	if (!xas->xa_alloc)
		return false;
	xas->xa_alloc->parent = NULL;
	XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list));
	xas->xa_node = XAS_RESTART;
	return true;
}

/*
 * Report a change to @node through the xas->xa_update callback if one is
 * set; otherwise only check that the node's private_list is empty.
 */
static void xas_update(struct xa_state *xas, struct xa_node *node)
{
	if (xas->xa_update)
		xas->xa_update(node);
	else
		XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
}

/*
 * Obtain and initialise a node with the given @shift.
 *
 * The node preallocated in xas->xa_alloc is used if there is one; otherwise
 * a GFP_NOWAIT | __GFP_NOWARN allocation is attempted, and on failure @xas
 * is put into the -ENOMEM error state and NULL is returned.  NULL is also
 * returned without allocating when @xas is invalid.
 *
 * xas->xa_node at entry becomes the parent of the new node.  If it is not
 * NULL its count is incremented here and xas->xa_offset is recorded as the
 * new node's offset; the node is not stored in the parent's slot here.
 */
static void *xas_alloc(struct xa_state *xas, unsigned int shift)
{
	struct xa_node *parent = xas->xa_node;
	struct xa_node *node = xas->xa_alloc;

	if (xas_invalid(xas))
		return NULL;

	if (node) {
		xas->xa_alloc = NULL;
	} else {
		gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;

		if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
			gfp |= __GFP_ACCOUNT;

		node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
		if (!node) {
			xas_set_err(xas, -ENOMEM);
			return NULL;
		}
	}

	if (parent) {
		node->offset = xas->xa_offset;
		parent->count++;
		XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE);
		xas_update(xas, parent);
	}
	XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
	XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
	node->shift = shift;
	node->count = 0;
	node->nr_values = 0;
	RCU_INIT_POINTER(node->parent, xas->xa_node);
	node->array = xas->xa;

	return node;
}

#ifdef CONFIG_XARRAY_MULTI
/* Returns the number of indices covered by a given xa_state */
static unsigned long xas_size(const struct xa_state *xas)
{
	return (xas->xa_sibs + 1UL) << xas->xa_shift;
}
#endif

/*
 * Use this to calculate the maximum index that will need to be created
 * in order to add the entry described by @xas.  Because we cannot store a
 * multi-index entry at index 0, the calculation is a little more complex
 * than you might expect.
 */
static unsigned long xas_max(struct xa_state *xas)
{
	unsigned long max = xas->xa_index;

#ifdef CONFIG_XARRAY_MULTI
	if (xas->xa_shift || xas->xa_sibs) {
		unsigned long mask = xas_size(xas) - 1;
		max |= mask;
		/* The range starts at index 0: ask for one index beyond it. */
		if (mask == max)
			max++;
	}
#endif

	return max;
}

/* The maximum index that can be contained in the array without expanding it */
static unsigned long max_index(void *entry)
{
	if (!xa_is_node(entry))
		return 0;
	return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1;
}

/*
 * Reduce the height of the tree while the node at xas->xa_node holds a
 * single entry, in slot 0, that can take its place as the head of the array:
 * either another node, or any entry if the node's shift is 0.  Each replaced
 * node is freed and @xas is left in the XAS_BOUNDS state.
 */
static void xas_shrink(struct xa_state *xas)
{
	struct xarray *xa = xas->xa;
	struct xa_node *node = xas->xa_node;

	for (;;) {
		void *entry;

		XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
		if (node->count != 1)
			break;
		entry = xa_entry_locked(xa, node, 0);
		/* The single entry is not in slot 0, so the node must stay. */
		if (!entry)
			break;
		if (!xa_is_node(entry) && node->shift)
			break;
		/* With XA_FLAGS_ZERO_BUSY a zero entry becomes an empty head. */
		if (xa_is_zero(entry) && xa_zero_busy(xa))
			entry = NULL;
		xas->xa_node = XAS_BOUNDS;

		RCU_INIT_POINTER(xa->xa_head, entry);
		if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK))
			xa_mark_clear(xa, XA_FREE_MARK);

		node->count = 0;
		node->nr_values = 0;
		/* Leave a retry marker behind in the slot of the node being freed. */
		if (!xa_is_node(entry))
			RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY);
		xas_update(xas, node);
		xa_node_free(node);
		if (!xa_is_node(entry))
			break;
		/* The child is the new head node; see whether it can go as well. */
		node = xa_to_node(entry);
		node->parent = NULL;
	}
}

/*
 * xas_delete_node() - Attempt to delete an xa_node
 * @xas: Array operation state.
 *
 * Attempts to delete the @xas->xa_node.  This will fail if xa->node has
 * a non-zero reference count.
 */
static void xas_delete_node(struct xa_state *xas)
{
	struct xa_node *node = xas->xa_node;

	for (;;) {
		struct xa_node *parent;

		XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
		if (node->count)
			break;

		parent = xa_parent_locked(xas->xa, node);
		xas->xa_node = parent;
		xas->xa_offset = node->offset;
		xa_node_free(node);

		/* That was the head node: the array is now empty. */
		if (!parent) {
			xas->xa->xa_head = NULL;
			xas->xa_node = XAS_BOUNDS;
			return;
		}

		/* Unhook the freed node, then check whether the parent emptied. */
		parent->slots[xas->xa_offset] = NULL;
		parent->count--;
		XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE);
		node = parent;
		xas_update(xas, node);
	}

	/* Stopped at a node without a parent: it may now be shrinkable. */
	if (!node->parent)
		xas_shrink(xas);
}

/**
 * xas_free_nodes() - Free this node and all nodes that it references
 * @xas: Array operation state.
 * @top: Node to free
 *
 * This node has been removed from the tree.  We must now free it and all
 * of its subnodes.  There may be RCU walkers with references into the tree,
 * so we must replace all entries with retry markers.
*/
static void xas_free_nodes(struct xa_state *xas, struct xa_node *top)
{
	unsigned int offset = 0;
	struct xa_node *node = top;

	for (;;) {
		void *entry = xa_entry_locked(xas->xa, node, offset);

		/* Descend into child nodes before handling this slot. */
		if (node->shift && xa_is_node(entry)) {
			node = xa_to_node(entry);
			offset = 0;
			continue;
		}
		if (entry)
			RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY);
		offset++;
		/* Finished this node: free it and resume in the parent. */
		while (offset == XA_CHUNK_SIZE) {
			struct xa_node *parent;

			parent = xa_parent_locked(xas->xa, node);
			offset = node->offset + 1;
			node->count = 0;
			node->nr_values = 0;
			xas_update(xas, node);
			xa_node_free(node);
			if (node == top)
				return;
			node = parent;
		}
	}
}

/*
 * xas_expand adds nodes to the head of the tree until it has reached
 * sufficient height to be able to contain @xas->xa_index
 *
 * Returns the shift value for the level above the resulting head (0 when
 * the array is empty and only index 0 is needed), or -ENOMEM if a node
 * could not be allocated.
 */
static int xas_expand(struct xa_state *xas, void *head)
{
	struct xarray *xa = xas->xa;
	struct xa_node *node = NULL;
	unsigned int shift = 0;
	unsigned long max = xas_max(xas);

	if (!head) {
		/* Empty array: nothing is allocated here, only the shift is computed. */
		if (max == 0)
			return 0;
		while ((max >> shift) >= XA_CHUNK_SIZE)
			shift += XA_CHUNK_SHIFT;
		return shift + XA_CHUNK_SHIFT;
	} else if (xa_is_node(head)) {
		node = xa_to_node(head);
		shift = node->shift + XA_CHUNK_SHIFT;
	}
	xas->xa_node = NULL;

	while (max > max_index(head)) {
		xa_mark_t mark = 0;

		XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
		node = xas_alloc(xas, shift);
		if (!node)
			return -ENOMEM;

		/* The old head becomes the only entry, in slot 0, of the new node. */
		node->count = 1;
		if (xa_is_value(head))
			node->nr_values = 1;
		RCU_INIT_POINTER(node->slots[0], head);

		/* Propagate the aggregated mark info to the new child */
		for (;;) {
			if (xa_track_free(xa) && mark == XA_FREE_MARK) {
				node_mark_all(node, XA_FREE_MARK);
				if (!xa_marked(xa, XA_FREE_MARK)) {
					node_clear_mark(node, 0, XA_FREE_MARK);
					xa_mark_set(xa, XA_FREE_MARK);
				}
			} else if (xa_marked(xa, mark)) {
				node_set_mark(node, 0, mark);
			}
			if (mark == XA_MARK_MAX)
				break;
			mark_inc(mark);
		}

		/*
		 * Now that the new node is fully initialised, we can add
		 * it to the tree
		 */
		if (xa_is_node(head)) {
			xa_to_node(head)->offset = 0;
			rcu_assign_pointer(xa_to_node(head)->parent, node);
		}
		head = xa_mk_node(node);
		rcu_assign_pointer(xa->xa_head, head);
		xas_update(xas, node);

		shift += XA_CHUNK_SHIFT;
	}

	xas->xa_node = node;
	return shift;
}

/*
 * xas_create() - Create a slot to store an entry in.
 * @xas: XArray operation state.
 * @allow_root: %true if we can store the entry in the root directly
 *
 * Most users will not need to call this function directly, as it is called
 * by xas_store(). It is useful for doing conditional store operations
 * (see the xa_cmpxchg() implementation for an example).
 *
 * Return: If the slot already existed, returns the contents of this slot.
 * If the slot was newly created, returns %NULL. If it failed to create the
 * slot, returns %NULL and indicates the error in @xas.
 */
static void *xas_create(struct xa_state *xas, bool allow_root)
{
	struct xarray *xa = xas->xa;
	void *entry;
	void __rcu **slot;
	struct xa_node *node = xas->xa_node;
	int shift;
	unsigned int order = xas->xa_shift;

	if (xas_top(node)) {
		/* Start from the head, growing the tree first if necessary. */
		entry = xa_head_locked(xa);
		xas->xa_node = NULL;
		if (!entry && xa_zero_busy(xa))
			entry = XA_ZERO_ENTRY;
		shift = xas_expand(xas, entry);
		if (shift < 0)
			return NULL;
		/* Force a node to be created when the root may not hold the entry. */
		if (!shift && !allow_root)
			shift = XA_CHUNK_SHIFT;
		entry = xa_head_locked(xa);
		slot = &xa->xa_head;
	} else if (xas_error(xas)) {
		return NULL;
	} else if (node) {
		unsigned int offset = xas->xa_offset;

		shift = node->shift;
		entry = xa_entry_locked(xa, node, offset);
		slot = &node->slots[offset];
	} else {
		shift = 0;
		entry = xa_head_locked(xa);
		slot = &xa->xa_head;
	}

	/* Walk down, allocating missing nodes, until the requested order. */
	while (shift > order) {
		shift -= XA_CHUNK_SHIFT;
		if (!entry) {
			node = xas_alloc(xas, shift);
			if (!node)
				break;
			if (xa_track_free(xa))
				node_mark_all(node, XA_FREE_MARK);
			rcu_assign_pointer(*slot, xa_mk_node(node));
		} else if (xa_is_node(entry)) {
			node = xa_to_node(entry);
		} else {
			break;
		}
		entry = xas_descend(xas, node);
		slot = &node->slots[xas->xa_offset];
	}

	return entry;
}

/**
 * xas_create_range() - Ensure that stores to this range will succeed
 * @xas: XArray operation state.
*
 * Creates all of the slots in the range covered by @xas. Sets @xas to
 * create single-index entries and positions it at the beginning of the
 * range. This is for the benefit of users which have not yet been
 * converted to use multi-index entries.
 */
void xas_create_range(struct xa_state *xas)
{
	/* Saved so the caller's view of @xas can be put back afterwards. */
	unsigned long index = xas->xa_index;
	unsigned char shift = xas->xa_shift;
	unsigned char sibs = xas->xa_sibs;

	/* Move to the last index of the range and work backwards from there. */
	xas->xa_index |= ((sibs + 1UL) << shift) - 1;
	if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift)
		xas->xa_offset |= sibs;
	xas->xa_shift = 0;
	xas->xa_sibs = 0;

	for (;;) {
		xas_create(xas, true);
		if (xas_error(xas))
			goto restore;
		if (xas->xa_index <= (index | XA_CHUNK_MASK))
			goto success;
		xas->xa_index -= XA_CHUNK_SIZE;

		/* Step @xas back to the slot preceding the one just created. */
		for (;;) {
			struct xa_node *node = xas->xa_node;
			if (node->shift >= shift)
				break;
			xas->xa_node = xa_parent_locked(xas->xa, node);
			xas->xa_offset = node->offset - 1;
			if (node->offset != 0)
				break;
		}
	}

restore:
	/* Error path: put back the original order, sibling count and index. */
	xas->xa_shift = shift;
	xas->xa_sibs = sibs;
	xas->xa_index = index;
	return;
success:
	/* Success path: shift and sibs stay 0; only the index is put back. */
	xas->xa_index = index;
	if (xas->xa_node)
		xas_set_offset(xas);
}
EXPORT_SYMBOL_GPL(xas_create_range);

/*
 * Apply the deltas @count and @values to @node's entry and value-entry
 * counters, notify via xas_update(), and try to delete the node when the
 * entry count went down.  Does nothing for a NULL @node or when both deltas
 * are zero.
 */
static void update_node(struct xa_state *xas, struct xa_node *node,
		int count, int values)
{
	if (!node || (!count && !values))
		return;

	node->count += count;
	node->nr_values += values;
	XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
	XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE);
	xas_update(xas, node);
	if (count < 0)
		xas_delete_node(xas);
}

/**
 * xas_store() - Store this entry in the XArray.
 * @xas: XArray operation state.
 * @entry: New entry.
 *
 * If @xas is operating on a multi-index entry, the entry returned by this
 * function is essentially meaningless (it may be an internal entry or it
 * may be %NULL, even if there are non-NULL entries at some of the indices
 * covered by the range). This is not a problem for any current users,
 * and can be changed if needed.
 *
 * Return: The old entry at this index.
*/
void *xas_store(struct xa_state *xas, void *entry)
{
	struct xa_node *node;
	void __rcu **slot = &xas->xa->xa_head;
	unsigned int offset, max;
	int count = 0;
	int values = 0;
	void *first, *next;
	bool value = xa_is_value(entry);

	if (entry) {
		/* Node and zero entries may not be stored directly in the root. */
		bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry);
		first = xas_create(xas, allow_root);
	} else {
		/* Erasing never needs to create slots: just look the entry up. */
		first = xas_load(xas);
	}

	if (xas_invalid(xas))
		return first;
	node = xas->xa_node;
	if (node && (xas->xa_shift < node->shift))
		xas->xa_sibs = 0;
	/* Storing the value that is already there is a no-op. */
	if ((first == entry) && !xas->xa_sibs)
		return first;

	next = first;
	offset = xas->xa_offset;
	max = xas->xa_offset + xas->xa_sibs;
	if (node) {
		slot = &node->slots[offset];
		if (xas->xa_sibs)
			xas_squash_marks(xas);
	}
	if (!entry)
		xas_init_marks(xas);

	for (;;) {
		/*
		 * Must clear the marks before setting the entry to NULL,
		 * otherwise xas_for_each_marked may find a NULL entry and
		 * stop early. rcu_assign_pointer contains a release barrier
		 * so the mark clearing will appear to happen before the
		 * entry is set to NULL.
		 */
		rcu_assign_pointer(*slot, entry);
		/* A replaced subtree is freed once it is no longer reachable. */
		if (xa_is_node(next) && (!node || node->shift))
			xas_free_nodes(xas, xa_to_node(next));
		if (!node)
			break;
		/* Track how the node's entry and value counts change. */
		count += !next - !entry;
		values += !xa_is_value(first) - !value;
		if (entry) {
			if (offset == max)
				break;
			/* Slots after the first hold sibling entries. */
			if (!xa_is_sibling(entry))
				entry = xa_mk_sibling(xas->xa_offset);
		} else {
			if (offset == XA_CHUNK_MASK)
				break;
		}
		next = xa_entry_locked(xas->xa, node, ++offset);
		if (!xa_is_sibling(next)) {
			if (!entry && (offset > max))
				break;
			first = next;
		}
		slot++;
	}

	update_node(xas, node, count, values);
	return first;
}
EXPORT_SYMBOL_GPL(xas_store);

/**
 * xas_get_mark() - Returns the state of this mark.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * Return: true if the mark is set, false if the mark is clear or @xas
 * is in an error state.
*/ bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark) { if (xas_invalid(xas)) return false; if (!xas->xa_node) return xa_marked(xas->xa, mark); return node_get_mark(xas->xa_node, xas->xa_offset, mark); } EXPORT_SYMBOL_GPL(xas_get_mark); /** * xas_set_mark() - Sets the mark on this entry and its parents. * @xas: XArray operation state. * @mark: Mark number. * * Sets the specified mark on this entry, and walks up the tree setting it * on all the ancestor entries. Does nothing if @xas has not been walked to * an entry, or is in an error state. */ void xas_set_mark(const struct xa_state *xas, xa_mark_t mark) { struct xa_node *node = xas->xa_node; unsigned int offset = xas->xa_offset; if (xas_invalid(xas)) return; while (node) { if (node_set_mark(node, offset, mark)) return; offset = node->offset; node = xa_parent_locked(xas->xa, node); } if (!xa_marked(xas->xa, mark)) xa_mark_set(xas->xa, mark); } EXPORT_SYMBOL_GPL(xas_set_mark); /** * xas_clear_mark() - Clears the mark on this entry and its parents. * @xas: XArray operation state. * @mark: Mark number. * * Clears the specified mark on this entry, and walks back to the head * attempting to clear it on all the ancestor entries. Does nothing if * @xas has not been walked to an entry, or is in an error state. */ void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark) { struct xa_node *node = xas->xa_node; unsigned int offset = xas->xa_offset; if (xas_invalid(xas)) return; while (node) { if (!node_clear_mark(node, offset, mark)) return; if (node_any_mark(node, mark)) return; offset = node->offset; node = xa_parent_locked(xas->xa, node); } if (xa_marked(xas->xa, mark)) xa_mark_clear(xas->xa, mark); } EXPORT_SYMBOL_GPL(xas_clear_mark); /** * xas_init_marks() - Initialise all marks for the entry * @xas: Array operations state. * * Initialise all marks for the entry specified by @xas. If we're tracking * free entries with a mark, we need to set it on all entries. All other * marks are cleared. 
*
 * This implementation is not as efficient as it could be; we may walk
 * up the tree multiple times.
 */
void xas_init_marks(const struct xa_state *xas)
{
	xa_mark_t mark = 0;

	/* Visit every mark from 0 up to and including XA_MARK_MAX. */
	for (;;) {
		if (xa_track_free(xas->xa) && mark == XA_FREE_MARK)
			xas_set_mark(xas, mark);
		else
			xas_clear_mark(xas, mark);
		if (mark == XA_MARK_MAX)
			break;
		mark_inc(mark);
	}
}
EXPORT_SYMBOL_GPL(xas_init_marks);

#ifdef CONFIG_XARRAY_MULTI
/*
 * Collect the marks set on slot @offset of @node into a bitmask in which
 * bit N is set when mark number N is set.
 */
static unsigned int node_get_marks(struct xa_node *node, unsigned int offset)
{
	unsigned int marks = 0;
	xa_mark_t mark = XA_MARK_0;

	for (;;) {
		if (node_get_mark(node, offset, mark))
			marks |= 1 << (__force unsigned int)mark;
		if (mark == XA_MARK_MAX)
			break;
		mark_inc(mark);
	}

	return marks;
}

/*
 * Set @mark in @node: on every slot when @sibs is 0, otherwise only on
 * every (@sibs + 1)th slot starting at slot 0.
 */
static inline void node_mark_slots(struct xa_node *node, unsigned int sibs,
		xa_mark_t mark)
{
	int i;

	if (sibs == 0)
		node_mark_all(node, mark);
	else {
		for (i = 0; i < XA_CHUNK_SIZE; i += sibs + 1)
			node_set_mark(node, i, mark);
	}
}

/*
 * For each mark whose bit is set in @marks (as produced by node_get_marks()),
 * set it on slot @offset of @node and, when @child is non-NULL, on the
 * slots of @child selected by node_mark_slots() for @sibs.
 */
static void node_set_marks(struct xa_node *node, unsigned int offset,
			struct xa_node *child, unsigned int sibs,
			unsigned int marks)
{
	xa_mark_t mark = XA_MARK_0;

	for (;;) {
		if (marks & (1 << (__force unsigned int)mark)) {
			node_set_mark(node, offset, mark);
			if (child)
				node_mark_slots(child, sibs, mark);
		}
		if (mark == XA_MARK_MAX)
			break;
		mark_inc(mark);
	}
}

/**
 * xas_split_alloc() - Allocate memory for splitting an entry.
 * @xas: XArray operation state.
 * @entry: New entry which will be stored in the array.
 * @order: Current entry order.
 * @gfp: Memory allocation flags.
 *
 * This function should be called before calling xas_split().
 * If necessary, it will allocate new nodes (and fill them with @entry)
 * to prepare for the upcoming split of an entry of @order size into
 * entries of the order stored in the @xas.
 *
 * Context: May sleep if @gfp flags permit.
*/ void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; unsigned int mask = xas->xa_sibs; /* XXX: no support for splitting really large entries yet */ if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT < order)) goto nomem; if (xas->xa_shift + XA_CHUNK_SHIFT > order) return; do { unsigned int i; void *sibling = NULL; struct xa_node *node; node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!node) goto nomem; node->array = xas->xa; for (i = 0; i < XA_CHUNK_SIZE; i++) { if ((i & mask) == 0) { RCU_INIT_POINTER(node->slots[i], entry); sibling = xa_mk_sibling(i); } else { RCU_INIT_POINTER(node->slots[i], sibling); } } RCU_INIT_POINTER(node->parent, xas->xa_alloc); xas->xa_alloc = node; } while (sibs-- > 0); return; nomem: xas_destroy(xas); xas_set_err(xas, -ENOMEM); } EXPORT_SYMBOL_GPL(xas_split_alloc); /** * xas_split() - Split a multi-index entry into smaller entries. * @xas: XArray operation state. * @entry: New entry to store in the array. * @order: Current entry order. * * The size of the new entries is set in @xas. The value in @entry is * copied to all the replacement entries. * * Context: Any context. The caller should hold the xa_lock. */ void xas_split(struct xa_state *xas, void *entry, unsigned int order) { unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; unsigned int offset, marks; struct xa_node *node; void *curr = xas_load(xas); int values = 0; node = xas->xa_node; if (xas_top(node)) return; marks = node_get_marks(node, xas->xa_offset); offset = xas->xa_offset + sibs; do { if (xas->xa_shift < node->shift) { struct xa_node *child = xas->xa_alloc; xas->xa_alloc = rcu_dereference_raw(child->parent); child->shift = node->shift - XA_CHUNK_SHIFT; child->offset = offset; child->count = XA_CHUNK_SIZE; child->nr_values = xa_is_value(entry) ? 
XA_CHUNK_SIZE : 0; RCU_INIT_POINTER(child->parent, node); node_set_marks(node, offset, child, xas->xa_sibs, marks); rcu_assign_pointer(node->slots[offset], xa_mk_node(child)); if (xa_is_value(curr)) values--; xas_update(xas, child); } else { unsigned int canon = offset - xas->xa_sibs; node_set_marks(node, canon, NULL, 0, marks); rcu_assign_pointer(node->slots[canon], entry); while (offset > canon) rcu_assign_pointer(node->slots[offset--], xa_mk_sibling(canon)); values += (xa_is_value(entry) - xa_is_value(curr)) * (xas->xa_sibs + 1); } } while (offset-- > xas->xa_offset); node->nr_values += values; xas_update(xas, node); } EXPORT_SYMBOL_GPL(xas_split); #endif /** * xas_pause() - Pause a walk to drop a lock. * @xas: XArray operation state. * * Some users need to pause a walk and drop the lock they're holding in * order to yield to a higher priority thread or carry out an operation * on an entry. Those users should call this function before they drop * the lock. It resets the @xas to be suitable for the next iteration * of the loop after the user has reacquired the lock. If most entries * found during a walk require you to call xas_pause(), the xa_for_each() * iterator may be more appropriate. * * Note that xas_pause() only works for forward iteration. If a user needs * to pause a reverse iteration, we will need a xas_pause_rev(). */ void xas_pause(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (xas_invalid(xas)) return; xas->xa_node = XAS_RESTART; if (node) { unsigned long offset = xas->xa_offset; while (++offset < XA_CHUNK_SIZE) { if (!xa_is_sibling(xa_entry(xas->xa, node, offset))) break; } xas->xa_index += (offset - xas->xa_offset) << node->shift; if (xas->xa_index == 0) xas->xa_node = XAS_BOUNDS; } else { xas->xa_index++; } } EXPORT_SYMBOL_GPL(xas_pause); /* * __xas_prev() - Find the previous entry in the XArray. * @xas: XArray operation state. * * Helper function for xas_prev() which handles all the complex cases * out of line. 
*/
void *__xas_prev(struct xa_state *xas)
{
	void *entry;

	if (!xas_frozen(xas->xa_node))
		xas->xa_index--;
	if (!xas->xa_node)
		return set_bounds(xas);
	if (xas_not_node(xas->xa_node))
		return xas_load(xas);

	if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node))
		xas->xa_offset--;

	/*
	 * NOTE(review): 255 is presumably the value an 8-bit xa_offset takes
	 * after decrementing 0 - confirm against struct xa_state.  Each
	 * iteration moves to the slot before this node in its parent.
	 */
	while (xas->xa_offset == 255) {
		xas->xa_offset = xas->xa_node->offset - 1;
		xas->xa_node = xa_parent(xas->xa, xas->xa_node);
		if (!xas->xa_node)
			return set_bounds(xas);
	}

	/* Descend until a non-node entry is found. */
	for (;;) {
		entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
		if (!xa_is_node(entry))
			return entry;

		xas->xa_node = xa_to_node(entry);
		xas_set_offset(xas);
	}
}
EXPORT_SYMBOL_GPL(__xas_prev);

/*
 * __xas_next() - Find the next entry in the XArray.
 * @xas: XArray operation state.
 *
 * Helper function for xas_next() which handles all the complex cases
 * out of line.
 */
void *__xas_next(struct xa_state *xas)
{
	void *entry;

	if (!xas_frozen(xas->xa_node))
		xas->xa_index++;
	if (!xas->xa_node)
		return set_bounds(xas);
	if (xas_not_node(xas->xa_node))
		return xas_load(xas);

	if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node))
		xas->xa_offset++;

	/* Ran off the end of this node: continue in the parent's next slot. */
	while (xas->xa_offset == XA_CHUNK_SIZE) {
		xas->xa_offset = xas->xa_node->offset + 1;
		xas->xa_node = xa_parent(xas->xa, xas->xa_node);
		if (!xas->xa_node)
			return set_bounds(xas);
	}

	/* Descend until a non-node entry is found. */
	for (;;) {
		entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
		if (!xa_is_node(entry))
			return entry;

		xas->xa_node = xa_to_node(entry);
		xas_set_offset(xas);
	}
}
EXPORT_SYMBOL_GPL(__xas_next);

/**
 * xas_find() - Find the next present entry in the XArray.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 *
 * If the @xas has not yet been walked to an entry, return the entry
 * which has an index >= xas.xa_index. If it has been walked, the entry
 * currently being pointed at has been processed, and so we move to the
 * next entry.
 *
 * If no entry is found and the array is smaller than @max, the iterator
 * is set to the smallest index not yet in the array.
This allows @xas
 * to be immediately passed to xas_store().
 *
 * Return: The entry, if found, otherwise %NULL.
 */
void *xas_find(struct xa_state *xas, unsigned long max)
{
	void *entry;

	if (xas_error(xas) || xas->xa_node == XAS_BOUNDS)
		return NULL;
	if (xas->xa_index > max)
		return set_bounds(xas);

	if (!xas->xa_node) {
		/* A NULL node means the entry lived in the head; nothing follows it. */
		xas->xa_index = 1;
		return set_bounds(xas);
	} else if (xas->xa_node == XAS_RESTART) {
		/* Not walked yet: the entry at xa_index itself is a candidate. */
		entry = xas_load(xas);
		if (entry || xas_not_node(xas->xa_node))
			return entry;
	} else if (!xas->xa_node->shift &&
		    xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) {
		/* Resynchronise the offset with an index the caller changed. */
		xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1;
	}

	xas_next_offset(xas);

	while (xas->xa_node && (xas->xa_index <= max)) {
		if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
			/* End of this node: move to the parent's next slot. */
			xas->xa_offset = xas->xa_node->offset + 1;
			xas->xa_node = xa_parent(xas->xa, xas->xa_node);
			continue;
		}

		entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
		if (xa_is_node(entry)) {
			xas->xa_node = xa_to_node(entry);
			xas->xa_offset = 0;
			continue;
		}
		if (entry && !xa_is_sibling(entry))
			return entry;

		xas_next_offset(xas);
	}

	if (!xas->xa_node)
		xas->xa_node = XAS_BOUNDS;
	return NULL;
}
EXPORT_SYMBOL_GPL(xas_find);

/**
 * xas_find_marked() - Find the next marked entry in the XArray.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 * @mark: Mark number to search for.
 *
 * If the @xas has not yet been walked to an entry, return the marked entry
 * which has an index >= xas.xa_index. If it has been walked, the entry
 * currently being pointed at has been processed, and so we return the
 * first marked entry with an index > xas.xa_index.
 *
 * If no marked entry is found and the array is smaller than @max, @xas is
 * set to the bounds state and xas->xa_index is set to the smallest index
 * not yet in the array. This allows @xas to be immediately passed to
 * xas_store().
 *
 * If no entry is found before @max is reached, @xas is set to the restart
 * state.
 *
 * Return: The entry, if found, otherwise %NULL.
*/
void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark)
{
	/* When true, the search starts after the current slot rather than at it. */
	bool advance = true;
	unsigned int offset;
	void *entry;

	if (xas_error(xas))
		return NULL;
	if (xas->xa_index > max)
		goto max;

	if (!xas->xa_node) {
		xas->xa_index = 1;
		goto out;
	} else if (xas_top(xas->xa_node)) {
		/* Not walked yet: start from the head of the array. */
		advance = false;
		entry = xa_head(xas->xa);
		xas->xa_node = NULL;
		if (xas->xa_index > max_index(entry))
			goto out;
		if (!xa_is_node(entry)) {
			if (xa_marked(xas->xa, mark))
				return entry;
			xas->xa_index = 1;
			goto out;
		}
		xas->xa_node = xa_to_node(entry);
		xas->xa_offset = xas->xa_index >> xas->xa_node->shift;
	}

	while (xas->xa_index <= max) {
		if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
			/* End of this node: move to the parent's next slot. */
			xas->xa_offset = xas->xa_node->offset + 1;
			xas->xa_node = xa_parent(xas->xa, xas->xa_node);
			if (!xas->xa_node)
				break;
			advance = false;
			continue;
		}

		if (!advance) {
			/* Landed on a sibling slot: step to the slot it refers to. */
			entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
			if (xa_is_sibling(entry)) {
				xas->xa_offset = xa_to_sibling(entry);
				xas_move_index(xas, xas->xa_offset);
			}
		}

		offset = xas_find_chunk(xas, advance, mark);
		if (offset > xas->xa_offset) {
			advance = false;
			xas_move_index(xas, offset);
			/* Mind the wrap */
			if ((xas->xa_index - 1) >= max)
				goto max;
			xas->xa_offset = offset;
			if (offset == XA_CHUNK_SIZE)
				continue;
		}

		entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
		/* A NULL entry is only a valid result when searching for free slots. */
		if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK))
			continue;
		if (!xa_is_node(entry))
			return entry;
		xas->xa_node = xa_to_node(entry);
		xas_set_offset(xas);
	}

out:
	if (xas->xa_index > max)
		goto max;
	return set_bounds(xas);
max:
	xas->xa_node = XAS_RESTART;
	return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_marked);

/**
 * xas_find_conflict() - Find the next present entry in a range.
 * @xas: XArray operation state.
 *
 * The @xas describes both a range and a position within that range.
 *
 * Context: Any context. Expects xa_lock to be held.
 * Return: The next entry in the range covered by @xas or %NULL.
*/
void *xas_find_conflict(struct xa_state *xas)
{
	void *curr;

	if (xas_error(xas))
		return NULL;

	if (!xas->xa_node)
		return NULL;

	if (xas_top(xas->xa_node)) {
		/* First call: walk down to the first slot of the range. */
		curr = xas_start(xas);
		if (!curr)
			return NULL;
		while (xa_is_node(curr)) {
			struct xa_node *node = xa_to_node(curr);
			curr = xas_descend(xas, node);
		}
		if (curr)
			return curr;
	}

	if (xas->xa_node->shift > xas->xa_shift)
		return NULL;

	for (;;) {
		if (xas->xa_node->shift == xas->xa_shift) {
			/* Reached the last slot of the range at this level. */
			if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs)
				break;
		} else if (xas->xa_offset == XA_CHUNK_MASK) {
			/* Last slot of a lower-level node: go back up. */
			xas->xa_offset = xas->xa_node->offset;
			xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node);
			if (!xas->xa_node)
				break;
			continue;
		}
		curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset);
		if (xa_is_sibling(curr))
			continue;
		while (xa_is_node(curr)) {
			xas->xa_node = xa_to_node(curr);
			xas->xa_offset = 0;
			curr = xa_entry_locked(xas->xa, xas->xa_node, 0);
		}
		if (curr)
			return curr;
	}
	/* Nothing found: step the offset back by the sibling count. */
	xas->xa_offset -= xas->xa_sibs;
	return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_conflict);

/**
 * xa_load() - Load an entry from an XArray.
 * @xa: XArray.
 * @index: index into array.
 *
 * Context: Any context. Takes and releases the RCU lock.
 * Return: The entry at @index in @xa.
 */
void *xa_load(struct xarray *xa, unsigned long index)
{
	XA_STATE(xas, xa, index);
	void *entry;

	rcu_read_lock();
	do {
		entry = xas_load(&xas);
		/* Zero entries are reported to the caller as NULL. */
		if (xa_is_zero(entry))
			entry = NULL;
	} while (xas_retry(&xas, entry));
	rcu_read_unlock();

	return entry;
}
EXPORT_SYMBOL(xa_load);

/*
 * Convert the outcome of an operation into the value handed back to the
 * caller: NULL for a zero entry, the error stored in @xas->xa_node when
 * @xas is in an error state, and @curr otherwise.
 */
static void *xas_result(struct xa_state *xas, void *curr)
{
	if (xa_is_zero(curr))
		return NULL;
	if (xas_error(xas))
		curr = xas->xa_node;
	return curr;
}

/**
 * __xa_erase() - Erase this entry from the XArray while locked.
 * @xa: XArray.
 * @index: Index into array.
 *
 * After this function returns, loading from @index will return %NULL.
 * If the index is part of a multi-index entry, all indices will be erased
 * and none of the entries will be part of a multi-index entry.
 *
 * Context: Any context.
Expects xa_lock to be held on entry. * Return: The entry which used to be at this index. */ void *__xa_erase(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); return xas_result(&xas, xas_store(&xas, NULL)); } EXPORT_SYMBOL(__xa_erase); /** * xa_erase() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock. * Return: The entry which used to be at this index. */ void *xa_erase(struct xarray *xa, unsigned long index) { void *entry; xa_lock(xa); entry = __xa_erase(xa, index); xa_unlock(xa); return entry; } EXPORT_SYMBOL(xa_erase); /** * __xa_store() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * You must already be holding the xa_lock when calling this function. * It will drop the lock if needed to allocate memory, and then reacquire * it afterwards. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: The old entry at this index or xa_err() if an error happened. */ void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { XA_STATE(xas, xa, index); void *curr; if (WARN_ON_ONCE(xa_is_advanced(entry))) return XA_ERROR(-EINVAL); if (xa_track_free(xa) && !entry) entry = XA_ZERO_ENTRY; do { curr = xas_store(&xas, entry); if (xa_track_free(xa)) xas_clear_mark(&xas, XA_FREE_MARK); } while (__xas_nomem(&xas, gfp)); return xas_result(&xas, curr); } EXPORT_SYMBOL(__xa_store); /** * xa_store() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. 
* * After this function returns, loads from this index will return @entry. * Storing into an existing multi-index entry updates the entry of every index. * The marks associated with @index are unaffected unless @entry is %NULL. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation * failed. */ void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; xa_lock(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock(xa); return curr; } EXPORT_SYMBOL(xa_store); /** * __xa_cmpxchg() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New entry. * @gfp: Memory allocation flags. * * You must already be holding the xa_lock when calling this function. * It will drop the lock if needed to allocate memory, and then reacquire * it afterwards. * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: The old entry at this index or xa_err() if an error happened. */ void *__xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { XA_STATE(xas, xa, index); void *curr; if (WARN_ON_ONCE(xa_is_advanced(entry))) return XA_ERROR(-EINVAL); do { curr = xas_load(&xas); if (curr == old) { xas_store(&xas, entry); if (xa_track_free(xa) && entry && !curr) xas_clear_mark(&xas, XA_FREE_MARK); } } while (__xas_nomem(&xas, gfp)); return xas_result(&xas, curr); } EXPORT_SYMBOL(__xa_cmpxchg); /** * __xa_insert() - Store this entry in the XArray if no entry is present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. 
Inserting will fail if a reserved entry is
 * present, even though loading from this index will return NULL.
 *
 * Context: Any context. Expects xa_lock to be held on entry. May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 if the store succeeded. -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
	XA_STATE(xas, xa, index);
	void *curr;

	if (WARN_ON_ONCE(xa_is_advanced(entry)))
		return -EINVAL;
	/* A NULL entry is stored as a zero entry. */
	if (!entry)
		entry = XA_ZERO_ENTRY;

	do {
		curr = xas_load(&xas);
		if (!curr) {
			xas_store(&xas, entry);
			if (xa_track_free(xa))
				xas_clear_mark(&xas, XA_FREE_MARK);
		} else {
			/* Any existing entry, including a zero entry, blocks the insert. */
			xas_set_err(&xas, -EBUSY);
		}
	} while (__xas_nomem(&xas, gfp));

	return xas_error(&xas);
}
EXPORT_SYMBOL(__xa_insert);

#ifdef CONFIG_XARRAY_MULTI
/*
 * Position @xas at @first and choose a shift and sibling count for a
 * single store that starts at @first and does not extend past @last.
 * xa_store_range() calls this repeatedly, advancing @first by xas_size()
 * after each store.
 */
static void xas_set_range(struct xa_state *xas, unsigned long first,
		unsigned long last)
{
	unsigned int shift = 0;
	unsigned long sibs = last - first;
	unsigned int offset = XA_CHUNK_MASK;

	xas_set(xas, first);

	/* Move up a level for as long as @first is aligned to a whole node. */
	while ((first & XA_CHUNK_MASK) == 0) {
		if (sibs < XA_CHUNK_MASK)
			break;
		if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK))
			break;
		shift += XA_CHUNK_SHIFT;
		if (offset == XA_CHUNK_MASK)
			offset = sibs & XA_CHUNK_MASK;
		sibs >>= XA_CHUNK_SHIFT;
		first >>= XA_CHUNK_SHIFT;
	}

	offset = first & XA_CHUNK_MASK;
	/* The entry must not run past the end of its node. */
	if (offset + sibs > XA_CHUNK_MASK)
		sibs = XA_CHUNK_MASK - offset;
	/* Nor may it cover indices beyond @last. */
	if ((((first + sibs + 1) << shift) - 1) > last)
		sibs -= 1;

	xas->xa_shift = shift;
	xas->xa_sibs = sibs;
}

/**
 * xa_store_range() - Store this entry at a range of indices in the XArray.
 * @xa: XArray.
 * @first: First index to affect.
 * @last: Last index to affect.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * After this function returns, loads from any index between @first and @last,
 * inclusive will return @entry.
 * Storing into an existing multi-index entry updates the entry of every index.
 * The marks associated with @index are unaffected unless @entry is %NULL.
*
 * Context: Process context. Takes and releases the xa_lock. May sleep
 * if the @gfp flags permit.
 * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in
 * an XArray, or xa_err(-ENOMEM) if memory allocation failed.
 */
void *xa_store_range(struct xarray *xa, unsigned long first,
		unsigned long last, void *entry, gfp_t gfp)
{
	XA_STATE(xas, xa, 0);

	if (WARN_ON_ONCE(xa_is_internal(entry)))
		return XA_ERROR(-EINVAL);
	if (last < first)
		return XA_ERROR(-EINVAL);

	do {
		xas_lock(&xas);
		if (entry) {
			/* Create the slot for the end of the range up front. */
			unsigned int order = BITS_PER_LONG;
			/* last + 1 == 0 means @last is the maximum index. */
			if (last + 1)
				order = __ffs(last + 1);
			xas_set_order(&xas, last, order);
			xas_create(&xas, true);
			if (xas_error(&xas))
				goto unlock;
		}
		/* Cover the range piece by piece, advancing @first each time. */
		do {
			xas_set_range(&xas, first, last);
			xas_store(&xas, entry);
			if (xas_error(&xas))
				goto unlock;
			first += xas_size(&xas);
		} while (first <= last);
unlock:
		xas_unlock(&xas);
	} while (xas_nomem(&xas, gfp));

	return xas_result(&xas, NULL);
}
EXPORT_SYMBOL(xa_store_range);

/**
 * xas_get_order() - Get the order of an entry.
 * @xas: XArray operation state.
 *
 * Called after xas_load, the xas should not be in an error state.
 *
 * Return: A number between 0 and 63 indicating the order of the entry.
 */
int xas_get_order(struct xa_state *xas)
{
	int order = 0;

	/* Without a node the order is reported as 0. */
	if (!xas->xa_node)
		return 0;

	/* Probe slots at power-of-two distances for sibling entries. */
	for (;;) {
		unsigned int slot = xas->xa_offset + (1 << order);

		if (slot >= XA_CHUNK_SIZE)
			break;
		if (!xa_is_sibling(xa_entry(xas->xa, xas->xa_node, slot)))
			break;
		order++;
	}

	/* Add the order contributed by the node's level in the tree. */
	order += xas->xa_node->shift;
	return order;
}
EXPORT_SYMBOL_GPL(xas_get_order);

/**
 * xa_get_order() - Get the order of an entry.
 * @xa: XArray.
 * @index: Index of the entry.
 *
 * Return: A number between 0 and 63 indicating the order of the entry.
*/
int xa_get_order(struct xarray *xa, unsigned long index)
{
	XA_STATE(xas, xa, index);
	int order = 0;
	void *entry;

	/* The load and the order query both run under the RCU read lock. */
	rcu_read_lock();
	entry = xas_load(&xas);
	/* The order stays 0 when nothing is loaded at @index. */
	if (entry)
		order = xas_get_order(&xas);
	rcu_read_unlock();

	return order;
}
EXPORT_SYMBOL(xa_get_order);
#endif /* CONFIG_XARRAY_MULTI */

/**
 * __xa_alloc() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @limit: Range for allocated ID.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index. A concurrent lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context. Expects xa_lock to be held on entry. May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit. -EINVAL (after a one-time
 * warning) if xa_is_advanced() is true for @entry or xa_track_free() is
 * false for @xa.
 */
int __xa_alloc(struct xarray *xa, u32 *id, void *entry,
		struct xa_limit limit, gfp_t gfp)
{
	XA_STATE(xas, xa, 0);

	if (WARN_ON_ONCE(xa_is_advanced(entry)))
		return -EINVAL;
	if (WARN_ON_ONCE(!xa_track_free(xa)))
		return -EINVAL;

	/* A NULL @entry is stored as XA_ZERO_ENTRY rather than as NULL. */
	if (!entry)
		entry = XA_ZERO_ENTRY;

	do {
		/* Search for a slot carrying XA_FREE_MARK in [limit.min, limit.max]. */
		xas.xa_index = limit.min;
		xas_find_marked(&xas, limit.max, XA_FREE_MARK);
		/*
		 * xa_node == XAS_RESTART after the search is treated as
		 * "nothing free": record -EBUSY.  Otherwise publish the index
		 * through @id before the entry itself is stored.
		 */
		if (xas.xa_node == XAS_RESTART)
			xas_set_err(&xas, -EBUSY);
		else
			*id = xas.xa_index;
		/*
		 * Both calls run on every pass, including the -EBUSY one.
		 * NOTE(review): presumably they do nothing once @xas holds an
		 * error; confirm against xas_store()/xas_clear_mark().
		 */
		xas_store(&xas, entry);
		xas_clear_mark(&xas, XA_FREE_MARK);
	/* Go round again for as long as __xas_nomem() returns true. */
	} while (__xas_nomem(&xas, gfp));

	return xas_error(&xas);
}
EXPORT_SYMBOL(__xa_alloc);

/**
 * __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.
A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { u32 min = limit.min; int ret; limit.min = max(min, *next); ret = __xa_alloc(xa, id, entry, limit, gfp); if ((xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED) && ret == 0) { xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED; ret = 1; } if (ret < 0 && limit.min > min) { limit.min = min; ret = __xa_alloc(xa, id, entry, limit, gfp); if (ret == 0) ret = 1; } if (ret >= 0) { *next = *id + 1; if (*next == 0) xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED; } return ret; } EXPORT_SYMBOL(__xa_alloc_cyclic); /** * __xa_set_mark() - Set this mark on this entry while locked. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Attempting to set a mark on a %NULL entry does not succeed. * * Context: Any context. Expects xa_lock to be held on entry. */ void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry = xas_load(&xas); if (entry) xas_set_mark(&xas, mark); } EXPORT_SYMBOL(__xa_set_mark); /** * __xa_clear_mark() - Clear this mark on this entry while locked. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Context: Any context. Expects xa_lock to be held on entry. 
*/ void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry = xas_load(&xas); if (entry) xas_clear_mark(&xas, mark); } EXPORT_SYMBOL(__xa_clear_mark); /** * xa_get_mark() - Inquire whether this mark is set on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * This function uses the RCU read lock, so the result may be out of date * by the time it returns. If you need the result to be stable, use a lock. * * Context: Any context. Takes and releases the RCU lock. * Return: True if the entry at @index has this mark set, false if it doesn't. */ bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { XA_STATE(xas, xa, index); void *entry; rcu_read_lock(); entry = xas_start(&xas); while (xas_get_mark(&xas, mark)) { if (!xa_is_node(entry)) goto found; entry = xas_descend(&xas, xa_to_node(entry)); } rcu_read_unlock(); return false; found: rcu_read_unlock(); return true; } EXPORT_SYMBOL(xa_get_mark); /** * xa_set_mark() - Set this mark on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Attempting to set a mark on a %NULL entry does not succeed. * * Context: Process context. Takes and releases the xa_lock. */ void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { xa_lock(xa); __xa_set_mark(xa, index, mark); xa_unlock(xa); } EXPORT_SYMBOL(xa_set_mark); /** * xa_clear_mark() - Clear this mark on this entry. * @xa: XArray. * @index: Index of entry. * @mark: Mark number. * * Clearing a mark always succeeds. * * Context: Process context. Takes and releases the xa_lock. */ void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) { xa_lock(xa); __xa_clear_mark(xa, index, mark); xa_unlock(xa); } EXPORT_SYMBOL(xa_clear_mark); /** * xa_find() - Search the XArray for an entry. * @xa: XArray. * @indexp: Pointer to an index. * @max: Maximum index to search to. * @filter: Selection criterion. 
* * Finds the entry in @xa which matches the @filter, and has the lowest * index that is at least @indexp and no more than @max. * If an entry is found, @indexp is updated to be the index of the entry. * This function is protected by the RCU read lock, so it may not find * entries which are being simultaneously added. It will not return an * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). * * Context: Any context. Takes and releases the RCU lock. * Return: The entry, if found, otherwise %NULL. */ void *xa_find(struct xarray *xa, unsigned long *indexp, unsigned long max, xa_mark_t filter) { XA_STATE(xas, xa, *indexp); void *entry; rcu_read_lock(); do { if ((__force unsigned int)filter < XA_MAX_MARKS) entry = xas_find_marked(&xas, max, filter); else entry = xas_find(&xas, max); } while (xas_retry(&xas, entry)); rcu_read_unlock(); if (entry) *indexp = xas.xa_index; return entry; } EXPORT_SYMBOL(xa_find); static bool xas_sibling(struct xa_state *xas) { struct xa_node *node = xas->xa_node; unsigned long mask; if (!IS_ENABLED(CONFIG_XARRAY_MULTI) || !node) return false; mask = (XA_CHUNK_SIZE << node->shift) - 1; return (xas->xa_index & mask) > ((unsigned long)xas->xa_offset << node->shift); } /** * xa_find_after() - Search the XArray for a present entry. * @xa: XArray. * @indexp: Pointer to an index. * @max: Maximum index to search to. * @filter: Selection criterion. * * Finds the entry in @xa which matches the @filter and has the lowest * index that is above @indexp and no more than @max. * If an entry is found, @indexp is updated to be the index of the entry. * This function is protected by the RCU read lock, so it may miss entries * which are being simultaneously added. It will not return an * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). * * Context: Any context. Takes and releases the RCU lock. * Return: The pointer, if found, otherwise %NULL. 
*/
void *xa_find_after(struct xarray *xa, unsigned long *indexp,
			unsigned long max, xa_mark_t filter)
{
	XA_STATE(xas, xa, *indexp + 1);
	void *entry;

	/* *indexp + 1 wrapped round to 0: there is no index after *indexp. */
	if (xas.xa_index == 0)
		return NULL;

	rcu_read_lock();
	for (;;) {
		/* A filter below XA_MAX_MARKS selects by mark, anything else by presence. */
		if ((__force unsigned int)filter < XA_MAX_MARKS)
			entry = xas_find_marked(&xas, max, filter);
		else
			entry = xas_find(&xas, max);

		/* Stop as soon as @xas is no longer valid. */
		if (xas_invalid(&xas))
			break;
		/* xas_sibling() hit: skip this result and search again. */
		if (xas_sibling(&xas))
			continue;
		/* Anything that is not a retry entry ends the search. */
		if (!xas_retry(&xas, entry))
			break;
	}
	rcu_read_unlock();

	if (entry)
		*indexp = xas.xa_index;
	return entry;
}
EXPORT_SYMBOL(xa_find_after);

/*
 * Copy up to @n entries found by xas_for_each() (bounded by @max) into @dst,
 * skipping retry entries.  Runs under the RCU read lock and returns the
 * number of entries copied.  The loop only stops on the count after a copy,
 * so callers must pass @n > 0.
 */
static unsigned int xas_extract_present(struct xa_state *xas, void **dst,
			unsigned long max, unsigned int n)
{
	void *entry;
	unsigned int i = 0;

	rcu_read_lock();
	xas_for_each(xas, entry, max) {
		if (xas_retry(xas, entry))
			continue;
		dst[i++] = entry;
		/* @dst is full. */
		if (i == n)
			break;
	}
	rcu_read_unlock();

	return i;
}

/*
 * Same as xas_extract_present(), but iterates with xas_for_each_marked()
 * so that only entries carrying @mark are visited.
 */
static unsigned int xas_extract_marked(struct xa_state *xas, void **dst,
			unsigned long max, unsigned int n, xa_mark_t mark)
{
	void *entry;
	unsigned int i = 0;

	rcu_read_lock();
	xas_for_each_marked(xas, entry, max, mark) {
		if (xas_retry(xas, entry))
			continue;
		dst[i++] = entry;
		/* @dst is full. */
		if (i == n)
			break;
	}
	rcu_read_unlock();

	return i;
}

/**
 * xa_extract() - Copy selected entries from the XArray into a normal array.
 * @xa: The source XArray to copy from.
 * @dst: The buffer to copy entries into.
 * @start: The first index in the XArray eligible to be selected.
 * @max: The last index in the XArray eligible to be selected.
 * @n: The maximum number of entries to copy.
 * @filter: Selection criterion.
 *
 * Copies up to @n entries that match @filter from the XArray. The
 * copied entries will have indices between @start and @max, inclusive.
 *
 * The @filter may be an XArray mark value, in which case entries which are
 * marked with that mark will be copied. It may also be %XA_PRESENT, in
 * which case all entries which are not %NULL will be copied.
 *
 * The entries returned may not represent a snapshot of the XArray at a
 * moment in time.
For example, if another thread stores to index 5, then * index 10, calling xa_extract() may return the old contents of index 5 * and the new contents of index 10. Indices not modified while this * function is running will not be skipped. * * If you need stronger guarantees, holding the xa_lock across calls to this * function will prevent concurrent modification. * * Context: Any context. Takes and releases the RCU lock. * Return: The number of entries copied. */ unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t filter) { XA_STATE(xas, xa, start); if (!n) return 0; if ((__force unsigned int)filter < XA_MAX_MARKS) return xas_extract_marked(&xas, dst, max, n, filter); return xas_extract_present(&xas, dst, max, n); } EXPORT_SYMBOL(xa_extract); /** * xa_delete_node() - Private interface for workingset code. * @node: Node to be removed from the tree. * @update: Function to call to update ancestor nodes. * * Context: xa_lock must be held on entry and will not be released. */ void xa_delete_node(struct xa_node *node, xa_update_node_t update) { struct xa_state xas = { .xa = node->array, .xa_index = (unsigned long)node->offset << (node->shift + XA_CHUNK_SHIFT), .xa_shift = node->shift + XA_CHUNK_SHIFT, .xa_offset = node->offset, .xa_node = xa_parent_locked(node->array, node), .xa_update = update, }; xas_store(&xas, NULL); } EXPORT_SYMBOL_GPL(xa_delete_node); /* For the benefit of the test suite */ /** * xa_destroy() - Free all internal data structures. * @xa: XArray. * * After calling this function, the XArray is empty and has freed all memory * allocated for its internal data structures. You are responsible for * freeing the objects referenced by the XArray. * * Context: Any context. Takes and releases the xa_lock, interrupt-safe. 
*/ void xa_destroy(struct xarray *xa) { XA_STATE(xas, xa, 0); unsigned long flags; void *entry; xas.xa_node = NULL; xas_lock_irqsave(&xas, flags); entry = xa_head_locked(xa); RCU_INIT_POINTER(xa->xa_head, NULL); xas_init_marks(&xas); if (xa_zero_busy(xa)) xa_mark_clear(xa, XA_FREE_MARK); /* lockdep checks we're still holding the lock in xas_free_nodes() */ if (xa_is_node(entry)) xas_free_nodes(&xas, xa_to_node(entry)); xas_unlock_irqrestore(&xas, flags); } EXPORT_SYMBOL(xa_destroy); #ifdef XA_DEBUG void xa_dump_node(const struct xa_node *node) { unsigned i, j; if (!node) return; if ((unsigned long)node & 3) { pr_cont("node %px\n", node); return; } pr_cont("node %px %s %d parent %px shift %d count %d values %d " "array %px list %px %px marks", node, node->parent ? "offset" : "max", node->offset, node->parent, node->shift, node->count, node->nr_values, node->array, node->private_list.prev, node->private_list.next); for (i = 0; i < XA_MAX_MARKS; i++) for (j = 0; j < XA_MARK_LONGS; j++) pr_cont(" %lx", node->marks[i][j]); pr_cont("\n"); } void xa_dump_index(unsigned long index, unsigned int shift) { if (!shift) pr_info("%lu: ", index); else if (shift >= BITS_PER_LONG) pr_info("0-%lu: ", ~0UL); else pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1)); } void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift) { if (!entry) return; xa_dump_index(index, shift); if (xa_is_node(entry)) { if (shift == 0) { pr_cont("%px\n", entry); } else { unsigned long i; struct xa_node *node = xa_to_node(entry); xa_dump_node(node); for (i = 0; i < XA_CHUNK_SIZE; i++) xa_dump_entry(node->slots[i], index + (i << node->shift), node->shift); } } else if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry), xa_to_value(entry), entry); else if (!xa_is_internal(entry)) pr_cont("%px\n", entry); else if (xa_is_retry(entry)) pr_cont("retry (%ld)\n", xa_to_internal(entry)); else if (xa_is_sibling(entry)) pr_cont("sibling (slot %ld)\n", 
xa_to_sibling(entry)); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else pr_cont("UNKNOWN ENTRY (%px)\n", entry); } void xa_dump(const struct xarray *xa) { void *entry = xa->xa_head; unsigned int shift = 0; pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry, xa->xa_flags, xa_marked(xa, XA_MARK_0), xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2)); if (xa_is_node(entry)) shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT; xa_dump_entry(entry, 0, shift); } #endif |
| 2 2 1 1 91 91 11 11 11 11 11 11 11 11 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 | // SPDX-License-Identifier: GPL-2.0-only /* * lib/hexdump.c */ #include <linux/types.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/minmax.h> #include <linux/export.h> #include <asm/unaligned.h> const char hex_asc[] = "0123456789abcdef"; EXPORT_SYMBOL(hex_asc); const char hex_asc_upper[] = "0123456789ABCDEF"; EXPORT_SYMBOL(hex_asc_upper); /** * hex_to_bin - convert a hex digit to its real value * @ch: ascii character represents hex digit * * hex_to_bin() converts one hex digit to its actual value or -1 in case of bad * input. * * This function is used to load cryptographic keys, so it is coded in such a * way that there are no conditions or memory accesses that depend on data. 
*
 * Explanation of the logic:
 * (ch - '9' - 1) is negative if ch <= '9'
 * ('0' - 1 - ch) is negative if ch >= '0'
 * we "and" these two values, so the result is negative if ch is in the range
 *	'0' ... '9'
 * we are only interested in the sign, so we do a shift ">> 8"; note that right
 *	shift of a negative value is implementation-defined, so we cast the
 *	value to (unsigned) before the shift --- we have 0xffffff if ch is in
 *	the range '0' ... '9', 0 otherwise
 * we "and" this value with (ch - '0' + 1) --- we have a value 1 ... 10 if ch is
 *	in the range '0' ... '9', 0 otherwise
 * we add this value to -1 --- we have a value 0 ... 9 if ch is in the range '0'
 *	... '9', -1 otherwise
 * the next line is similar to the previous one, but we need to decode both
 *	uppercase and lowercase letters, so we use (ch & 0xdf), which converts
 *	lowercase to uppercase
 */
int hex_to_bin(unsigned char ch)
{
	/* ch with bit 0x20 cleared, so 'a'..'f' compare equal to 'A'..'F'. */
	unsigned char cu = ch & 0xdf;
	/*
	 * Start from -1, then add the digit term (1..10 for '0'..'9', else 0)
	 * and the letter term (11..16 for 'A'..'F' after folding, else 0).
	 * The whole thing is one expression with no branches or table lookups.
	 */
	return -1 +
		((ch - '0' +  1) & (unsigned)((ch - '9' - 1) & ('0' - 1 - ch)) >> 8) +
		((cu - 'A' + 11) & (unsigned)((cu - 'F' - 1) & ('A' - 1 - cu)) >> 8);
}
EXPORT_SYMBOL(hex_to_bin);

/**
 * hex2bin - convert an ascii hexadecimal string to its binary representation
 * @dst: binary result
 * @src: ascii hexadecimal string
 * @count: result length
 *
 * Return 0 on success, -EINVAL in case of bad input.
*/ int hex2bin(u8 *dst, const char *src, size_t count) { while (count--) { int hi, lo; hi = hex_to_bin(*src++); if (unlikely(hi < 0)) return -EINVAL; lo = hex_to_bin(*src++); if (unlikely(lo < 0)) return -EINVAL; *dst++ = (hi << 4) | lo; } return 0; } EXPORT_SYMBOL(hex2bin); /** * bin2hex - convert binary data to an ascii hexadecimal string * @dst: ascii hexadecimal result * @src: binary data * @count: binary data length */ char *bin2hex(char *dst, const void *src, size_t count) { const unsigned char *_src = src; while (count--) dst = hex_byte_pack(dst, *_src++); return dst; } EXPORT_SYMBOL(bin2hex); /** * hex_dump_to_buffer - convert a blob of data to "hex ASCII" in memory * @buf: data blob to dump * @len: number of bytes in the @buf * @rowsize: number of bytes to print per line; must be 16 or 32 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) * @linebuf: where to put the converted data * @linebuflen: total size of @linebuf, including space for terminating NUL * @ascii: include ASCII after the hex output * * hex_dump_to_buffer() works on one "line" of output at a time, i.e., * 16 or 32 bytes of input data converted to hex + ASCII output. * * Given a buffer of u8 data, hex_dump_to_buffer() converts the input data * to a hex + ASCII dump at the supplied memory location. * The converted output is always NUL-terminated. * * E.g.: * hex_dump_to_buffer(frame->data, frame->len, 16, 1, * linebuf, sizeof(linebuf), true); * * example output buffer: * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO * * Return: * The amount of bytes placed in the buffer without terminating NUL. If the * output was truncated, then the return value is the number of bytes * (excluding the terminating NUL) which would have been written to the final * string if enough space had been available. 
*/
int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize,
		       char *linebuf, size_t linebuflen, bool ascii)
{
	const u8 *ptr = buf;
	int ngroups;
	u8 ch;
	int j, lx = 0;		/* lx: next write position in linebuf */
	int ascii_column;
	int ret;

	/* Any rowsize other than 16 or 32 falls back to 16. */
	if (rowsize != 16 && rowsize != 32)
		rowsize = 16;

	if (len > rowsize)		/* limit to one line at a time */
		len = rowsize;
	/* groupsize must be a power of two no larger than 8, else use 1. */
	if (!is_power_of_2(groupsize) || groupsize > 8)
		groupsize = 1;
	if ((len % groupsize) != 0)	/* no mixed size output */
		groupsize = 1;

	ngroups = len / groupsize;
	/*
	 * Column the ASCII part is padded out to: two hex digits per byte of
	 * a full row, one separator per group, plus one.
	 */
	ascii_column = rowsize * 2 + rowsize / groupsize + 1;

	/* No room even for the NUL: only report the length that was needed. */
	if (!linebuflen)
		goto overflow1;

	/* Nothing to dump: produce an empty string. */
	if (!len)
		goto nil;

	/*
	 * Groups of 8, 4 or 2 bytes are formatted with snprintf(), which
	 * NUL-terminates on its own; a return value that does not fit in the
	 * remaining space means the output was truncated.
	 */
	if (groupsize == 8) {
		const u64 *ptr8 = buf;

		for (j = 0; j < ngroups; j++) {
			ret = snprintf(linebuf + lx, linebuflen - lx,
				       "%s%16.16llx", j ? " " : "",
				       get_unaligned(ptr8 + j));
			if (ret >= linebuflen - lx)
				goto overflow1;
			lx += ret;
		}
	} else if (groupsize == 4) {
		const u32 *ptr4 = buf;

		for (j = 0; j < ngroups; j++) {
			ret = snprintf(linebuf + lx, linebuflen - lx,
				       "%s%8.8x", j ? " " : "",
				       get_unaligned(ptr4 + j));
			if (ret >= linebuflen - lx)
				goto overflow1;
			lx += ret;
		}
	} else if (groupsize == 2) {
		const u16 *ptr2 = buf;

		for (j = 0; j < ngroups; j++) {
			ret = snprintf(linebuf + lx, linebuflen - lx,
				       "%s%4.4x", j ? " " : "",
				       get_unaligned(ptr2 + j));
			if (ret >= linebuflen - lx)
				goto overflow1;
			lx += ret;
		}
	} else {
		/*
		 * Single bytes are written by hand.  Every store is preceded
		 * by a check that the character and a later NUL both fit.
		 */
		for (j = 0; j < len; j++) {
			if (linebuflen < lx + 2)
				goto overflow2;
			ch = ptr[j];
			linebuf[lx++] = hex_asc_hi(ch);
			if (linebuflen < lx + 2)
				goto overflow2;
			linebuf[lx++] = hex_asc_lo(ch);
			if (linebuflen < lx + 2)
				goto overflow2;
			linebuf[lx++] = ' ';
		}
		/* Step back over the separator written after the last byte. */
		if (j)
			lx--;
	}
	if (!ascii)
		goto nil;

	/* Pad with spaces up to the ASCII column. */
	while (lx < ascii_column) {
		if (linebuflen < lx + 2)
			goto overflow2;
		linebuf[lx++] = ' ';
	}
	/* ASCII part: printable ASCII bytes as themselves, the rest as '.'. */
	for (j = 0; j < len; j++) {
		if (linebuflen < lx + 2)
			goto overflow2;
		ch = ptr[j];
		linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.';
	}
nil:
	/* Everything fitted: terminate and return the length written. */
	linebuf[lx] = '\0';
	return lx;
overflow2:
	/* Hand-written path ran out of room: terminate what is there. */
	linebuf[lx++] = '\0';
overflow1:
	/* Truncated: return the length the complete line would have had. */
	return ascii ? ascii_column + len : (groupsize * 2 + 1) * ngroups - 1;
}
EXPORT_SYMBOL(hex_dump_to_buffer);

#ifdef CONFIG_PRINTK
/**
 * print_hex_dump - print a text hex dump to syslog for a binary blob of data
 * @level: kernel log level (e.g. KERN_DEBUG)
 * @prefix_str: string to prefix each line with;
 * caller supplies trailing spaces for alignment if desired
 * @prefix_type: controls whether prefix of an offset, address, or none
 * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE)
 * @rowsize: number of bytes to print per line; must be 16 or 32
 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1)
 * @buf: data blob to dump
 * @len: number of bytes in the @buf
 * @ascii: include ASCII after the hex output
 *
 * Given a buffer of u8 data, print_hex_dump() prints a hex + ASCII dump
 * to the kernel log at the specified kernel log level, with an optional
 * leading prefix.
 *
 * print_hex_dump() works on one "line" of output at a time, i.e.,
 * 16 or 32 bytes of input data converted to hex + ASCII output.
 * print_hex_dump() iterates over the entire input @buf, breaking it into
 * "line size" chunks to format and print.
 *
 * E.g.:
 * print_hex_dump(KERN_DEBUG, "raw data: ", DUMP_PREFIX_ADDRESS,
 * 16, 1, frame->data, frame->len, true);
 *
 * Example output using %DUMP_PREFIX_OFFSET and 1-byte mode:
 * 0009ab42: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO
 * Example output using %DUMP_PREFIX_ADDRESS and 4-byte mode:
 * ffffffff88089af0: 73727170 77767574 7b7a7978 7f7e7d7c pqrstuvwxyz{|}~.
*/ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { const u8 *ptr = buf; int i, linelen, remaining = len; unsigned char linebuf[32 * 3 + 2 + 32 + 1]; if (rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len; i += rowsize) { linelen = min(remaining, rowsize); remaining -= rowsize; hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, linebuf, sizeof(linebuf), ascii); switch (prefix_type) { case DUMP_PREFIX_ADDRESS: printk("%s%s%p: %s\n", level, prefix_str, ptr + i, linebuf); break; case DUMP_PREFIX_OFFSET: printk("%s%s%.8x: %s\n", level, prefix_str, i, linebuf); break; default: printk("%s%s%s\n", level, prefix_str, linebuf); break; } } } EXPORT_SYMBOL(print_hex_dump); #endif /* defined(CONFIG_PRINTK) */ |
| 19 15 6 1 23 23 19 4 19 8 2 1 10 2 5 1 7 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * dir.c */ /* * This file implements code to read directories from disk. * * See namei.c for a description of directory organisation on disk. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" static const unsigned char squashfs_filetype_table[] = { DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK }; /* * Lookup offset (f_pos) in the directory index, returning the * metadata block containing it. * * If we get an error reading the index then return the part of the index * (if any) we have managed to read - the index isn't essential, just * quicker. 
*/
static int get_dir_index_using_offset(struct super_block *sb,
	u64 *next_block, int *next_offset, u64 index_start, int index_offset,
	int i_count, u64 f_pos)
{
	struct squashfs_sb_info *msblk = sb->s_fs_info;
	int err, i, index, length = 0;
	unsigned int size;
	struct squashfs_dir_index dir_index;

	TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n",
					i_count, f_pos);

	/*
	 * Translate from external f_pos to the internal f_pos.  This
	 * is offset by 3 because we invent "." and ".." entries which are
	 * not actually stored in the directory.
	 */
	if (f_pos <= 3)
		return f_pos;
	f_pos -= 3;

	/*
	 * Walk the index entries in order.  Any read error, an entry lying
	 * beyond f_pos, or an implausible name size ends the walk, keeping
	 * whatever position the previous entries established.
	 */
	for (i = 0; i < i_count; i++) {
		err = squashfs_read_metadata(sb, &dir_index, &index_start,
				&index_offset, sizeof(dir_index));
		if (err < 0)
			break;

		index = le32_to_cpu(dir_index.index);
		if (index > f_pos)
			/*
			 * Found the index we're looking for.
			 */
			break;

		size = le32_to_cpu(dir_index.size) + 1;

		/* size should never be larger than SQUASHFS_NAME_LEN */
		if (size > SQUASHFS_NAME_LEN)
			break;

		/* NULL destination: the name bytes are read but not kept. */
		err = squashfs_read_metadata(sb, NULL, &index_start,
				&index_offset, size);
		if (err < 0)
			break;

		/* This entry is usable: remember its position and block. */
		length = index;
		*next_block = le32_to_cpu(dir_index.start_block) +
					msblk->directory_table;
	}

	/* Advance the caller's offset by length, modulo the metadata block size. */
	*next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;

	/*
	 * Translate back from internal f_pos to external f_pos.
	 */
	return length + 3;
}


/*
 * Directory iterator.  Emits "." and ".." first, then walks the directory
 * headers and their entries, emitting every entry that ends beyond ctx->pos.
 * Returns 0 on every path; allocation and read failures are only reported
 * through ERROR().
 */
static int squashfs_readdir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
	u64 block = squashfs_i(inode)->start + msblk->directory_table;
	int offset = squashfs_i(inode)->offset, length, err;
	unsigned int inode_number, dir_count, size, type;
	struct squashfs_dir_header dirh;
	struct squashfs_dir_entry *dire;

	TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset);

	/* Room for the fixed part, the longest allowed name and a NUL. */
	dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
	if (dire == NULL) {
		ERROR("Failed to allocate squashfs_dir_entry\n");
		goto finish;
	}

	/*
	 * Return "." and ".." entries as the first two filenames in the
	 * directory.  To maximise compression these two entries are not
	 * stored in the directory, and so we invent them here.
	 *
	 * It also means that the external f_pos is offset by 3 from the
	 * on-disk directory f_pos.
	 */
	while (ctx->pos < 3) {
		char *name;
		int i_ino;

		if (ctx->pos == 0) {
			name = ".";
			size = 1;
			i_ino = inode->i_ino;
		} else {
			name = "..";
			size = 2;
			i_ino = squashfs_i(inode)->parent;
		}

		if (!dir_emit(ctx, name, size, i_ino,
				squashfs_filetype_table[1]))
			goto finish;

		/* pos moves by the name length: 0 -> 1 after ".", 1 -> 3 after "..". */
		ctx->pos += size;
	}

	/* Use the directory index to start near ctx->pos. */
	length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
				squashfs_i(inode)->dir_idx_start,
				squashfs_i(inode)->dir_idx_offset,
				squashfs_i(inode)->dir_idx_cnt,
				ctx->pos);

	while (length < i_size_read(inode)) {
		/*
		 * Read directory header
		 */
		err = squashfs_read_metadata(inode->i_sb, &dirh, &block,
					&offset, sizeof(dirh));
		if (err < 0)
			goto failed_read;

		length += sizeof(dirh);

		/* The header's count field plus one entries follow. */
		dir_count = le32_to_cpu(dirh.count) + 1;

		if (dir_count > SQUASHFS_DIR_COUNT)
			goto failed_read;

		while (dir_count--) {
			/*
			 * Read directory entry.
			 */
			err = squashfs_read_metadata(inode->i_sb, dire, &block,
					&offset, sizeof(*dire));
			if (err < 0)
				goto failed_read;

			size = le16_to_cpu(dire->size) + 1;

			/* size should never be larger than SQUASHFS_NAME_LEN */
			if (size > SQUASHFS_NAME_LEN)
				goto failed_read;

			err = squashfs_read_metadata(inode->i_sb, dire->name,
					&block, &offset, size);
			if (err < 0)
				goto failed_read;

			length += sizeof(*dire) + size;

			/* Entry ends at or before ctx->pos: skip it. */
			if (ctx->pos >= length)
				continue;

			dire->name[size] = '\0';
			/*
			 * The entry's inode_number field is added to the
			 * header's inode number as a signed 16-bit value.
			 */
			inode_number = le32_to_cpu(dirh.inode_number) +
				((short) le16_to_cpu(dire->inode_number));
			type = le16_to_cpu(dire->type);

			/* type indexes squashfs_filetype_table below. */
			if (type > SQUASHFS_MAX_DIR_TYPE)
				goto failed_read;

			if (!dir_emit(ctx, dire->name, size,
					inode_number,
					squashfs_filetype_table[type]))
				goto finish;

			/* Resume after this entry on the next call. */
			ctx->pos = length;
		}
	}

finish:
	kfree(dire);
	return 0;

failed_read:
	ERROR("Unable to read directory block [%llx:%x]\n", block, offset);
	kfree(dire);
	return 0;
}


/*
 * File operations for directories: reads go through generic_read_dir,
 * iteration through squashfs_readdir, seeking through generic_file_llseek.
 */
const struct file_operations squashfs_dir_ops = {
	.read = generic_read_dir,
	.iterate_shared = squashfs_readdir,
	.llseek = generic_file_llseek,
};
|
| 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 | /* Copyright (c) 2018, Mellanox Technologies All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <crypto/aead.h> #include <linux/highmem.h> #include <linux/module.h> #include <linux/netdevice.h> #include <net/dst.h> #include <net/inet_connection_sock.h> #include <net/tcp.h> #include <net/tls.h> #include <linux/skbuff_ref.h> #include "tls.h" #include "trace.h" /* device_offload_lock is used to synchronize tls_dev_add * against NETDEV_DOWN notifications. 
*/
static DECLARE_RWSEM(device_offload_lock);

/* Workqueue that runs the destruct_work items queued by
 * tls_device_queue_ctx_destruction().
 */
static struct workqueue_struct *destruct_wq __read_mostly;

/* tls_device_attach() adds contexts to tls_device_list; tls_device_down()
 * moves them to tls_device_down_list. List manipulation in this file is done
 * with tls_device_lock held.
 */
static LIST_HEAD(tls_device_list);
static LIST_HEAD(tls_device_down_list);
static DEFINE_SPINLOCK(tls_device_lock);

/* Fallback page used by tls_device_record_close() for the tag placeholder
 * when no page fragment can be refilled.
 */
static struct page *dummy_page;

/* Free the TX and/or RX offload contexts (only for directions configured as
 * TLS_HW) and then the TLS context itself via tls_ctx_free(NULL, ctx).
 */
static void tls_device_free_ctx(struct tls_context *ctx)
{
	if (ctx->tx_conf == TLS_HW)
		kfree(tls_offload_ctx_tx(ctx));

	if (ctx->rx_conf == TLS_HW)
		kfree(tls_offload_ctx_rx(ctx));

	tls_ctx_free(NULL, ctx);
}

/* Work handler for offload_ctx->destruct_work (installed by INIT_WORK in
 * alloc_offload_ctx_tx()). Asks the driver to delete the TX offload state,
 * drops the netdev reference, clears ctx->netdev and frees the context.
 */
static void tls_device_tx_del_task(struct work_struct *work)
{
	/* Recover the TX offload context that embeds this work item. */
	struct tls_offload_context_tx *offload_ctx =
		container_of(work, struct tls_offload_context_tx, destruct_work);
	struct tls_context *ctx = offload_ctx->ctx;
	struct net_device *netdev;

	/* Safe, because this is the destroy flow, refcount is 0, so
	 * tls_device_down can't store this field in parallel.
	 */
	netdev = rcu_dereference_protected(ctx->netdev,
					   !refcount_read(&ctx->refcount));

	netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX);
	dev_put(netdev);
	ctx->netdev = NULL;
	tls_device_free_ctx(ctx);
}

/* Drop one reference on @ctx under tls_device_lock. If it was not the last
 * reference, return without further action. Otherwise unlink the context from
 * its list and either queue the asynchronous TX teardown work (when a netdev
 * is still set and TX is TLS_HW) or free the context right away after the
 * lock is released.
 */
static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
{
	struct net_device *netdev;
	unsigned long flags;
	bool async_cleanup;

	spin_lock_irqsave(&tls_device_lock, flags);
	if (unlikely(!refcount_dec_and_test(&ctx->refcount))) {
		spin_unlock_irqrestore(&tls_device_lock, flags);
		return;
	}

	list_del(&ctx->list); /* Remove from tls_device_list / tls_device_down_list */

	/* Safe, because this is the destroy flow, refcount is 0, so
	 * tls_device_down can't store this field in parallel.
	 */
	netdev = rcu_dereference_protected(ctx->netdev,
					   !refcount_read(&ctx->refcount));

	async_cleanup = netdev && ctx->tx_conf == TLS_HW;
	if (async_cleanup) {
		struct tls_offload_context_tx *offload_ctx = tls_offload_ctx_tx(ctx);

		/* queue_work inside the spinlock
		 * to make sure tls_device_down waits for that work.
		 */
		queue_work(destruct_wq, &offload_ctx->destruct_work);
	}
	spin_unlock_irqrestore(&tls_device_lock, flags);

	/* Synchronous path: nothing was queued, so free here. */
	if (!async_cleanup)
		tls_device_free_ctx(ctx);
}

/* We assume that the socket is already connected */
/* Returns the device obtained from netdev_sk_get_lowest_dev() for the
 * socket's dst, with a reference taken via dev_hold(), or NULL when the
 * socket has no dst. The dst reference is released before returning.
 */
static struct net_device *get_netdev_for_sock(struct sock *sk)
{
	struct dst_entry *dst = sk_dst_get(sk);
	struct net_device *netdev = NULL;

	if (likely(dst)) {
		netdev = netdev_sk_get_lowest_dev(dst->dev, sk);
		dev_hold(netdev);
	}

	dst_release(dst);

	return netdev;
}

/* Unref every fragment recorded in @record, then free the record itself. */
static void destroy_record(struct tls_record_info *record)
{
	int i;

	for (i = 0; i < record->num_frags; i++)
		__skb_frag_unref(&record->frags[i], false);
	kfree(record);
}

/* Unlink and destroy every record on offload_ctx->records_list and reset the
 * retransmit hint, which would otherwise point at a freed record.
 */
static void delete_all_records(struct tls_offload_context_tx *offload_ctx)
{
	struct tls_record_info *info, *temp;

	list_for_each_entry_safe(info, temp, &offload_ctx->records_list, list) {
		list_del(&info->list);
		destroy_record(info);
	}

	offload_ctx->retransmit_hint = NULL;
}

/* Callback registered through clean_acked_data_enable() in
 * tls_set_device_offload(). With ctx->lock held, clears the retransmit hint
 * if acked_seq is not before the hint's end_seq, destroys records from the
 * head of the list until one is found whose end_seq is still ahead of
 * acked_seq, and advances unacked_record_sn by the number destroyed.
 */
static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_record_info *info, *temp;
	struct tls_offload_context_tx *ctx;
	u64 deleted_records = 0;
	unsigned long flags;

	if (!tls_ctx)
		return;

	ctx = tls_offload_ctx_tx(tls_ctx);

	spin_lock_irqsave(&ctx->lock, flags);
	info = ctx->retransmit_hint;
	if (info && !before(acked_seq, info->end_seq))
		ctx->retransmit_hint = NULL;

	list_for_each_entry_safe(info, temp, &ctx->records_list, list) {
		/* Stop at the first record that is not yet covered by acked_seq. */
		if (before(acked_seq, info->end_seq))
			break;
		list_del(&info->list);

		destroy_record(info);
		deleted_records++;
	}

	ctx->unacked_record_sn += deleted_records;
	spin_unlock_irqrestore(&ctx->lock, flags);
}

/* At this point, there should be no references on this
 * socket and no in-flight SKBs associated with this
 * socket, so it is safe to free all the resources.
*/
void tls_device_sk_destruct(struct sock *sk)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);

	/* Run the destructor that tls_device_attach() saved in the context. */
	tls_ctx->sk_destruct(sk);

	if (tls_ctx->tx_conf == TLS_HW) {
		if (ctx->open_record)
			destroy_record(ctx->open_record);
		delete_all_records(ctx);
		crypto_free_aead(ctx->aead_send);
		clean_acked_data_disable(inet_csk(sk));
	}

	tls_device_queue_ctx_destruction(tls_ctx);
}
EXPORT_SYMBOL_GPL(tls_device_sk_destruct);

/* Hand the socket's TLS context to tls_free_partial_record(). */
void tls_device_free_resources_tx(struct sock *sk)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);

	tls_free_partial_record(sk, tls_ctx);
}

/* Emit the resync-request trace event and set TLS_TX_SYNC_SCHED in the
 * context flags; warns if the bit was already set. tls_push_record() tests
 * this bit and performs the actual resync.
 */
void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);

	trace_tls_device_tx_resync_req(sk, got_seq, exp_seq);
	WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags));
}
EXPORT_SYMBOL_GPL(tls_offload_tx_resync_request);

/* Pass the current TX record sequence number and TCP sequence @seq to the
 * driver's tls_dev_resync callback, holding device_offload_lock for reading.
 * TLS_TX_SYNC_SCHED is cleared only when the callback did not report an
 * error (err stays 0 when no netdev is attached).
 */
static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx,
				 u32 seq)
{
	struct net_device *netdev;
	int err = 0;
	u8 *rcd_sn;

	tcp_write_collapse_fence(sk);
	rcd_sn = tls_ctx->tx.rec_seq;

	trace_tls_device_tx_resync_send(sk, seq, rcd_sn);
	down_read(&device_offload_lock);
	netdev = rcu_dereference_protected(tls_ctx->netdev,
					   lockdep_is_held(&device_offload_lock));
	if (netdev)
		err = netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq,
							 rcd_sn,
							 TLS_OFFLOAD_CTX_DIR_TX);
	up_read(&device_offload_lock);
	if (err)
		return;

	clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags);
}

/* Account @size bytes at pfrag->offset to @record. If they directly follow
 * the record's last fragment in the same page, that fragment is grown;
 * otherwise a new fragment is filled in and a page reference is taken.
 * Advances pfrag->offset and record->len by @size in both cases.
 */
static void tls_append_frag(struct tls_record_info *record,
			    struct page_frag *pfrag,
			    int size)
{
	skb_frag_t *frag;

	frag = &record->frags[record->num_frags - 1];
	if (skb_frag_page(frag) == pfrag->page &&
	    skb_frag_off(frag) + skb_frag_size(frag) == pfrag->offset) {
		skb_frag_size_add(frag, size);
	} else {
		++frag;
		skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset,
					size);
		++record->num_frags;
		get_page(pfrag->page);
	}

	pfrag->offset += size;
	record->len += size;
}

/* Close out @record: compute its end sequence, append it to the records list,
 * clear open_record, run a pending TX resync if one was scheduled, advance
 * the record sequence number, mirror the record's fragments into sg_tx_data
 * (charging socket memory and taking a page reference for each) and push the
 * scatterlist with tls_push_sg(). Returns the tls_push_sg() result.
 */
static int tls_push_record(struct sock *sk,
			   struct tls_context *ctx,
			   struct tls_offload_context_tx *offload_ctx,
			   struct tls_record_info *record,
			   int flags)
{
	struct tls_prot_info *prot = &ctx->prot_info;
	struct tcp_sock *tp = tcp_sk(sk);
	skb_frag_t *frag;
	int i;

	record->end_seq = tp->write_seq + record->len;
	list_add_tail_rcu(&record->list, &offload_ctx->records_list);
	offload_ctx->open_record = NULL;

	if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags))
		tls_device_resync_tx(sk, ctx, tp->write_seq);

	tls_advance_record_sn(sk, prot, &ctx->tx);

	for (i = 0; i < record->num_frags; i++) {
		frag = &record->frags[i];
		/* Clear any end marker left from a previous, longer record. */
		sg_unmark_end(&offload_ctx->sg_tx_data[i]);
		sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag),
			    skb_frag_size(frag), skb_frag_off(frag));
		sk_mem_charge(sk, skb_frag_size(frag));
		get_page(skb_frag_page(frag));
	}
	sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]);

	/* all ready, send */
	return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
}

/* Finish @record before it is pushed: append tag_size placeholder bytes and
 * write the record prepend into the first fragment via tls_fill_prepend().
 */
static void tls_device_record_close(struct sock *sk,
				    struct tls_context *ctx,
				    struct tls_record_info *record,
				    struct page_frag *pfrag,
				    unsigned char record_type)
{
	struct tls_prot_info *prot = &ctx->prot_info;
	struct page_frag dummy_tag_frag;

	/* append tag
	 * device will fill in the tag, we just need to append a placeholder
	 * use socket memory to improve coalescing (re-using a single buffer
	 * increases frag count)
	 * if we can't allocate memory now use the dummy page
	 */
	if (unlikely(pfrag->size - pfrag->offset < prot->tag_size) &&
	    !skb_page_frag_refill(prot->tag_size, pfrag, sk->sk_allocation)) {
		dummy_tag_frag.page = dummy_page;
		dummy_tag_frag.offset = 0;
		pfrag = &dummy_tag_frag;
	}
	tls_append_frag(record, pfrag, prot->tag_size);

	/* fill prepend */
	tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]),
			 record->len - prot->overhead_size,
			 record_type);
}

/* Allocate a record whose single initial fragment reserves @prepend_size
 * bytes at pfrag->offset, and install it as offload_ctx->open_record.
 * Returns 0, or -ENOMEM if the allocation fails.
 */
static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx,
				 struct page_frag *pfrag,
				 size_t prepend_size)
{
	struct tls_record_info *record;
	skb_frag_t *frag;

	record = kmalloc(sizeof(*record), GFP_KERNEL);
	if (!record)
		return -ENOMEM;

	frag = &record->frags[0];
	skb_frag_fill_page_desc(frag, pfrag->page, pfrag->offset,
				prepend_size);

	get_page(pfrag->page);
	pfrag->offset += prepend_size;

	record->num_frags = 1;
	record->len = prepend_size;
	offload_ctx->open_record = record;
	return 0;
}

/* Make sure there is an open record and page-fragment space to copy into.
 * When no record is open, one is created (signalling memory pressure and
 * returning -ENOMEM if the fragment cannot be refilled for the prepend).
 * Returns 0 on success or a negative errno.
 */
static int tls_do_allocation(struct sock *sk,
			     struct tls_offload_context_tx *offload_ctx,
			     struct page_frag *pfrag,
			     size_t prepend_size)
{
	int ret;

	if (!offload_ctx->open_record) {
		if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
						   sk->sk_allocation))) {
			READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
			sk_stream_moderate_sndbuf(sk);
			return -ENOMEM;
		}

		ret = tls_create_new_record(offload_ctx, pfrag, prepend_size);
		if (ret)
			return ret;

		/* Space is left in the fragment after the prepend: done. */
		if (pfrag->size > pfrag->offset)
			return 0;
	}

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;

	return 0;
}

/* Copy @bytes from iterator @i to @addr in up to three steps: a regular copy
 * up to the next SMP_CACHE_BYTES boundary, a copy_from_iter_nocache() of the
 * whole cache-line multiples, and a regular copy of the remainder.
 * Returns 0, or -EFAULT if any step copies fewer bytes than requested.
 */
static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i)
{
	size_t pre_copy, nocache;

	/* Bytes from addr to the next SMP_CACHE_BYTES boundary, 0 if aligned
	 * (assumes SMP_CACHE_BYTES is a power of two).
	 */
	pre_copy = ~((unsigned long)addr - 1) & (SMP_CACHE_BYTES - 1);
	if (pre_copy) {
		pre_copy = min(pre_copy, bytes);
		if (copy_from_iter(addr, pre_copy, i) != pre_copy)
			return -EFAULT;
		bytes -= pre_copy;
		addr += pre_copy;
	}

	nocache = round_down(bytes, SMP_CACHE_BYTES);
	if (copy_from_iter_nocache(addr, nocache, i) != nocache)
		return -EFAULT;
	bytes -= nocache;
	addr += nocache;

	if (bytes && copy_from_iter(addr, bytes, i) != bytes)
		return -EFAULT;

	return 0;
}

/* Core TX path: take up to @size bytes from @iter, pack them into records and
 * push each record when it is full, has used MAX_SKB_FRAGS - 1 fragments, or
 * the data is exhausted without MSG_MORE.
 *
 * Returns -EOPNOTSUPP for unsupported flags, -EINVAL when MSG_MORE and
 * MSG_EOR are both set, -sk->sk_err on a pending socket error, the number of
 * bytes consumed (orig_size - size) when that is positive, and otherwise the
 * last rc value.
 */
static int tls_push_data(struct sock *sk,
			 struct iov_iter *iter,
			 size_t size, int flags,
			 unsigned char record_type)
{
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct tls_prot_info *prot = &tls_ctx->prot_info;
	struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
	struct tls_record_info *record;
	int tls_push_record_flags;
	struct page_frag *pfrag;
	size_t orig_size = size;
	u32 max_open_record_len;
	bool more = false;
	bool done = false;
	int copy, rc = 0;
	long timeo;

	if (flags &
	    ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
	      MSG_SPLICE_PAGES | MSG_EOR))
		return -EOPNOTSUPP;

	if ((flags & (MSG_MORE | MSG_EOR)) == (MSG_MORE | MSG_EOR))
		return -EINVAL;

	if (unlikely(sk->sk_err))
		return -sk->sk_err;

	flags |= MSG_SENDPAGE_DECRYPTED;
	/* Records pushed before the final one carry MSG_MORE; the last_record
	 * path below switches back to the caller's flags.
	 */
	tls_push_record_flags = flags | MSG_MORE;

	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
	if (tls_is_partially_sent_record(tls_ctx)) {
		rc = tls_push_partial_record(sk, tls_ctx, flags);
		if (rc < 0)
			return rc;
	}

	pfrag = sk_page_frag(sk);

	/* TLS_HEADER_SIZE is not counted as part of the TLS record, and
	 * we need to leave room for an authentication tag.
	 */
	max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
			      prot->prepend_size;
	do {
		rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size);
		if (unlikely(rc)) {
			/* Allocation failed: wait for memory and retry on success. */
			rc = sk_stream_wait_memory(sk, &timeo);
			if (!rc)
				continue;

			record = ctx->open_record;
			if (!record)
				break;
handle_error:
			if (record_type != TLS_RECORD_TYPE_DATA) {
				/* avoid sending partial
				 * record with type !=
				 * application_data
				 */
				size = orig_size;
				destroy_record(record);
				ctx->open_record = NULL;
			} else if (record->len > prot->prepend_size) {
				/* Data record already holds payload: push it. */
				goto last_record;
			}

			break;
		}

		record = ctx->open_record;

		copy = min_t(size_t, size, max_open_record_len - record->len);
		if (copy && (flags & MSG_SPLICE_PAGES)) {
			/* Splice path: reference the caller's page, no copy. */
			struct page_frag zc_pfrag;
			struct page **pages = &zc_pfrag.page;
			size_t off;

			rc = iov_iter_extract_pages(iter, &pages,
						    copy, 1, 0, &off);
			if (rc <= 0) {
				if (rc == 0)
					rc = -EIO;
				goto handle_error;
			}
			copy = rc;

			if (WARN_ON_ONCE(!sendpage_ok(zc_pfrag.page))) {
				iov_iter_revert(iter, copy);
				rc = -EIO;
				goto handle_error;
			}

			zc_pfrag.offset = off;
			zc_pfrag.size = copy;
			tls_append_frag(record, &zc_pfrag, copy);
		} else if (copy) {
			/* Copy path: bounded by the space left in pfrag. */
			copy = min_t(size_t, copy, pfrag->size - pfrag->offset);

			rc = tls_device_copy_data(page_address(pfrag->page) +
						  pfrag->offset, copy,
						  iter);
			if (rc)
				goto handle_error;
			tls_append_frag(record, pfrag, copy);
		}

		size -= copy;
		if (!size) {
last_record:
			tls_push_record_flags = flags;
			if (flags & MSG_MORE) {
				/* Leave the record open for the next call. */
				more = true;
				break;
			}

			done = true;
		}

		if (done || record->len >= max_open_record_len ||
		    (record->num_frags >= MAX_SKB_FRAGS - 1)) {
			tls_device_record_close(sk, tls_ctx, record,
						pfrag, record_type);

			rc = tls_push_record(sk,
					     tls_ctx,
					     ctx,
					     record,
					     tls_push_record_flags);
			if (rc < 0)
				break;
		}
	} while (!done);

	tls_ctx->pending_open_record_frags = more;

	/* Report progress in preference to an error once bytes were consumed. */
	if (orig_size - size > 0)
		rc = orig_size - size;

	return rc;
}

/* sendmsg entry point for device offload. Strips MSG_SPLICE_PAGES unless
 * zerocopy_sendfile is enabled, takes tx_lock and the socket lock, processes
 * any control message to obtain the record type, and calls tls_push_data().
 */
int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	unsigned char record_type = TLS_RECORD_TYPE_DATA;
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	int rc;

	if (!tls_ctx->zerocopy_sendfile)
		msg->msg_flags &= ~MSG_SPLICE_PAGES;

	mutex_lock(&tls_ctx->tx_lock);
	lock_sock(sk);

	if (unlikely(msg->msg_controllen)) {
		rc = tls_process_cmsg(sk, msg, &record_type);
		if (rc)
			goto out;
	}

	rc = tls_push_data(sk, &msg->msg_iter, size, msg->msg_flags,
			   record_type);

out:
	release_sock(sk);
	mutex_unlock(&tls_ctx->tx_lock);
	return rc;
}

/* If a partially sent record is outstanding, call tls_push_data() with an
 * empty iterator and no flags. The condition is checked once without locks
 * as a fast path and again after taking tx_lock and the socket lock.
 */
void tls_device_splice_eof(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct tls_context *tls_ctx = tls_get_ctx(sk);
	struct iov_iter iter = {};

	if (!tls_is_partially_sent_record(tls_ctx))
		return;

	mutex_lock(&tls_ctx->tx_lock);
	lock_sock(sk);

	if (tls_is_partially_sent_record(tls_ctx)) {
		iov_iter_bvec(&iter, ITER_SOURCE, NULL, 0, 0);
		tls_push_data(sk, &iter, 0, 0, TLS_RECORD_TYPE_DATA);
	}

	release_sock(sk);
	mutex_unlock(&tls_ctx->tx_lock);
}

/* Find the record on context->records_list that covers TCP sequence @seq.
 * The search starts at the retransmit hint when the hint is usable, otherwise
 * at the head of the list. On success the record is returned, its record
 * sequence number is stored in *p_record_sn and the hint may be advanced.
 * Returns NULL when no record covers @seq.
 */
struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
				       u32 seq, u64 *p_record_sn)
{
	u64 record_sn = context->hint_record_sn;
	struct tls_record_info *info, *last;

	info = context->retransmit_hint;
	if (!info ||
	    before(seq, info->end_seq - info->len)) {
		/* if retransmit_hint is irrelevant start
		 * from the beginning of the list
		 */
		info = list_first_entry_or_null(&context->records_list,
						struct tls_record_info, list);
		if (!info)
			return NULL;
		/* send the start_marker record if seq number is before the
		 * tls offload start marker sequence number. This record is
		 * required to handle TCP packets which are before TLS offload
		 * started.
		 * And if it's not start marker, look if this seq number
		 * belongs to the list.
		 */
		if (likely(!tls_record_is_start_marker(info))) {
			/* we have the first record, get the last record to see
			 * if this seq number belongs to the list.
			 */
			last = list_last_entry(&context->records_list,
					       struct tls_record_info, list);

			if (!between(seq, tls_record_start_seq(info),
				     last->end_seq))
				return NULL;
		}
		record_sn = context->unacked_record_sn;
	}

	/* We just need the _rcu for the READ_ONCE() */
	rcu_read_lock();
	list_for_each_entry_from_rcu(info, &context->records_list, list) {
		if (before(seq, info->end_seq)) {
			/* Only ever move the hint forward in the stream. */
			if (!context->retransmit_hint ||
			    after(info->end_seq,
				  context->retransmit_hint->end_seq)) {
				context->hint_record_sn = record_sn;
				context->retransmit_hint = info;
			}
			*p_record_sn = record_sn;
			goto exit_rcu_unlock;
		}
		record_sn++;
	}
	info = NULL;

exit_rcu_unlock:
	rcu_read_unlock();
	return info;
}
EXPORT_SYMBOL(tls_get_record);

/* push_pending_record hook (installed in tls_set_device_offload()): calls
 * tls_push_data() with an empty iterator and the caller's flags.
 */
static int tls_device_push_pending_record(struct sock *sk, int flags)
{
	struct iov_iter iter;

	iov_iter_kvec(&iter, ITER_SOURCE, NULL, 0, 0);
	return tls_push_data(sk, &iter, 0, flags, TLS_RECORD_TYPE_DATA);
}

/* If a partially sent record is outstanding, push it without blocking.
 * sk->sk_allocation is switched to GFP_ATOMIC for the duration of the push
 * and restored afterwards.
 */
void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
{
	if (tls_is_partially_sent_record(ctx)) {
		gfp_t sk_allocation = sk->sk_allocation;

		WARN_ON_ONCE(sk->sk_write_pending);

		sk->sk_allocation = GFP_ATOMIC;
		tls_push_partial_record(sk, ctx,
					MSG_DONTWAIT | MSG_NOSIGNAL |
					MSG_SENDPAGE_DECRYPTED);
		sk->sk_allocation = sk_allocation;
	}
}

/* Forward an RX resync (TCP sequence @seq, record sequence number @rcd_sn) to
 * the driver's tls_dev_resync callback if a netdev is still attached, reading
 * ctx->netdev under rcu_read_lock(). The LINUX_MIB_TLSRXDEVICERESYNC counter
 * is incremented whether or not a netdev was found.
 */
static void tls_device_resync_rx(struct tls_context *tls_ctx,
				 struct sock *sk, u32 seq, u8 *rcd_sn)
{
	struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx);
	struct net_device *netdev;

	trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_ctx->resync_type);
	rcu_read_lock();
	netdev = rcu_dereference(tls_ctx->netdev);
	if (netdev)
		netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn,
						   TLS_OFFLOAD_CTX_DIR_RX);
	rcu_read_unlock();
	TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC);
}

static bool
tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async, s64 resync_req, u32 *seq, u16 *rcd_delta) { u32 is_async = resync_req & RESYNC_REQ_ASYNC; u32 req_seq = resync_req >> 32; u32 req_end = req_seq + ((resync_req >> 16) & 0xffff); u16 i; *rcd_delta = 0; if (is_async) { /* shouldn't get to wraparound: * too long in async stage, something bad happened */ if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) return false; /* asynchronous stage: log all headers seq such that * req_seq <= seq <= end_seq, and wait for real resync request */ if (before(*seq, req_seq)) return false; if (!after(*seq, req_end) && resync_async->loglen < TLS_DEVICE_RESYNC_ASYNC_LOGMAX) resync_async->log[resync_async->loglen++] = *seq; resync_async->rcd_delta++; return false; } /* synchronous stage: check against the logged entries and * proceed to check the next entries if no match was found */ for (i = 0; i < resync_async->loglen; i++) if (req_seq == resync_async->log[i] && atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) { *rcd_delta = resync_async->rcd_delta - i; *seq = req_seq; resync_async->loglen = 0; resync_async->rcd_delta = 0; return true; } resync_async->loglen = 0; resync_async->rcd_delta = 0; if (req_seq == *seq && atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) return true; return false; } void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; u16 rcd_delta; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) return; if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags))) return; prot = &tls_ctx->prot_info; rx_ctx = tls_offload_ctx_rx(tls_ctx); memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); switch (rx_ctx->resync_type) { case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ: resync_req = atomic64_read(&rx_ctx->resync_req); req_seq = 
resync_req >> 32; seq += TLS_HEADER_SIZE - 1; is_req_pending = resync_req; if (likely(!is_req_pending) || req_seq != seq || !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) return; break; case TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT: if (likely(!rx_ctx->resync_nh_do_now)) return; /* head of next rec is already in, note that the sock_inq will * include the currently parsed message when called from parser */ sock_data = tcp_inq(sk); if (sock_data > rcd_len) { trace_tls_device_rx_resync_nh_delay(sk, sock_data, rcd_len); return; } rx_ctx->resync_nh_do_now = 0; seq += rcd_len; tls_bigint_increment(rcd_sn, prot->rec_seq_size); break; case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC: resync_req = atomic64_read(&rx_ctx->resync_async->req); is_req_pending = resync_req; if (likely(!is_req_pending)) return; if (!tls_device_rx_resync_async(rx_ctx->resync_async, resync_req, &seq, &rcd_delta)) return; tls_bigint_subtract(rcd_sn, rcd_delta); break; } tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); } static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx, struct tls_offload_context_rx *ctx, struct sock *sk, struct sk_buff *skb) { struct strp_msg *rxm; /* device will request resyncs by itself based on stream scan */ if (ctx->resync_type != TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT) return; /* already scheduled */ if (ctx->resync_nh_do_now) return; /* seen decrypted fragments since last fully-failed record */ if (ctx->resync_nh_reset) { ctx->resync_nh_reset = 0; ctx->resync_nh.decrypted_failed = 1; ctx->resync_nh.decrypted_tgt = TLS_DEVICE_RESYNC_NH_START_IVAL; return; } if (++ctx->resync_nh.decrypted_failed <= ctx->resync_nh.decrypted_tgt) return; /* doing resync, bump the next target in case it fails */ if (ctx->resync_nh.decrypted_tgt < TLS_DEVICE_RESYNC_NH_MAX_IVAL) ctx->resync_nh.decrypted_tgt *= 2; else ctx->resync_nh.decrypted_tgt += TLS_DEVICE_RESYNC_NH_MAX_IVAL; rxm = strp_msg(skb); /* head of next rec is already in, parser will sync for us */ if (tcp_inq(sk) 
> rxm->full_len) { trace_tls_device_rx_resync_nh_schedule(sk); ctx->resync_nh_do_now = 1; } else { struct tls_prot_info *prot = &tls_ctx->prot_info; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size); tls_bigint_increment(rcd_sn, prot->rec_seq_size); tls_device_resync_rx(tls_ctx, sk, tcp_sk(sk)->copied_seq, rcd_sn); } } static int tls_device_reencrypt(struct sock *sk, struct tls_context *tls_ctx) { struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx); const struct tls_cipher_desc *cipher_desc; int err, offset, copy, data_len, pos; struct sk_buff *skb, *skb_iter; struct scatterlist sg[1]; struct strp_msg *rxm; char *orig_buf, *buf; cipher_desc = get_cipher_desc(tls_ctx->crypto_recv.info.cipher_type); DEBUG_NET_WARN_ON_ONCE(!cipher_desc || !cipher_desc->offloadable); rxm = strp_msg(tls_strp_msg(sw_ctx)); orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv, sk->sk_allocation); if (!orig_buf) return -ENOMEM; buf = orig_buf; err = tls_strp_msg_cow(sw_ctx); if (unlikely(err)) goto free_buf; skb = tls_strp_msg(sw_ctx); rxm = strp_msg(skb); offset = rxm->offset; sg_init_table(sg, 1); sg_set_buf(&sg[0], buf, rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv); err = skb_copy_bits(skb, offset, buf, TLS_HEADER_SIZE + cipher_desc->iv); if (err) goto free_buf; /* We are interested only in the decrypted data not the auth */ err = decrypt_skb(sk, sg); if (err != -EBADMSG) goto free_buf; else err = 0; data_len = rxm->full_len - cipher_desc->tag; if (skb_pagelen(skb) > offset) { copy = min_t(int, skb_pagelen(skb) - offset, data_len); if (skb->decrypted) { err = skb_store_bits(skb, offset, buf, copy); if (err) goto free_buf; } offset += copy; buf += copy; } pos = skb_pagelen(skb); skb_walk_frags(skb, skb_iter) { int frag_pos; /* Practically all frags must belong to msg if reencrypt * is needed with current strparser and coalescing logic, * but strparser may "get optimized", so let's be safe. 
*/ if (pos + skb_iter->len <= offset) goto done_with_frag; if (pos >= data_len + rxm->offset) break; frag_pos = offset - pos; copy = min_t(int, skb_iter->len - frag_pos, data_len + rxm->offset - offset); if (skb_iter->decrypted) { err = skb_store_bits(skb_iter, frag_pos, buf, copy); if (err) goto free_buf; } offset += copy; buf += copy; done_with_frag: pos += skb_iter->len; } free_buf: kfree(orig_buf); return err; } int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx) { struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx); struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx); struct sk_buff *skb = tls_strp_msg(sw_ctx); struct strp_msg *rxm = strp_msg(skb); int is_decrypted, is_encrypted; if (!tls_strp_msg_mixed_decrypted(sw_ctx)) { is_decrypted = skb->decrypted; is_encrypted = !is_decrypted; } else { is_decrypted = 0; is_encrypted = 0; } trace_tls_device_decrypted(sk, tcp_sk(sk)->copied_seq - rxm->full_len, tls_ctx->rx.rec_seq, rxm->full_len, is_encrypted, is_decrypted); if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags))) { if (likely(is_encrypted || is_decrypted)) return is_decrypted; /* After tls_device_down disables the offload, the next SKB will * likely have initial fragments decrypted, and final ones not * decrypted. We need to reencrypt that single SKB. */ return tls_device_reencrypt(sk, tls_ctx); } /* Return immediately if the record is either entirely plaintext or * entirely ciphertext. Otherwise handle reencrypt partially decrypted * record. 
*/ if (is_decrypted) { ctx->resync_nh_reset = 1; return is_decrypted; } if (is_encrypted) { tls_device_core_ctrl_rx_resync(tls_ctx, ctx, sk, skb); return 0; } ctx->resync_nh_reset = 1; return tls_device_reencrypt(sk, tls_ctx); } static void tls_device_attach(struct tls_context *ctx, struct sock *sk, struct net_device *netdev) { if (sk->sk_destruct != tls_device_sk_destruct) { refcount_set(&ctx->refcount, 1); dev_hold(netdev); RCU_INIT_POINTER(ctx->netdev, netdev); spin_lock_irq(&tls_device_lock); list_add_tail(&ctx->list, &tls_device_list); spin_unlock_irq(&tls_device_lock); ctx->sk_destruct = sk->sk_destruct; smp_store_release(&sk->sk_destruct, tls_device_sk_destruct); } } static struct tls_offload_context_tx *alloc_offload_ctx_tx(struct tls_context *ctx) { struct tls_offload_context_tx *offload_ctx; __be64 rcd_sn; offload_ctx = kzalloc(sizeof(*offload_ctx), GFP_KERNEL); if (!offload_ctx) return NULL; INIT_WORK(&offload_ctx->destruct_work, tls_device_tx_del_task); INIT_LIST_HEAD(&offload_ctx->records_list); spin_lock_init(&offload_ctx->lock); sg_init_table(offload_ctx->sg_tx_data, ARRAY_SIZE(offload_ctx->sg_tx_data)); /* start at rec_seq - 1 to account for the start marker record */ memcpy(&rcd_sn, ctx->tx.rec_seq, sizeof(rcd_sn)); offload_ctx->unacked_record_sn = be64_to_cpu(rcd_sn) - 1; offload_ctx->ctx = ctx; return offload_ctx; } int tls_set_device_offload(struct sock *sk) { struct tls_record_info *start_marker_record; struct tls_offload_context_tx *offload_ctx; const struct tls_cipher_desc *cipher_desc; struct tls_crypto_info *crypto_info; struct tls_prot_info *prot; struct net_device *netdev; struct tls_context *ctx; char *iv, *rec_seq; int rc; ctx = tls_get_ctx(sk); prot = &ctx->prot_info; if (ctx->priv_ctx_tx) return -EEXIST; netdev = get_netdev_for_sock(sk); if (!netdev) { pr_err_ratelimited("%s: netdev not found\n", __func__); return -EINVAL; } if (!(netdev->features & NETIF_F_HW_TLS_TX)) { rc = -EOPNOTSUPP; goto release_netdev; } crypto_info = 
&ctx->crypto_send.info; if (crypto_info->version != TLS_1_2_VERSION) { rc = -EOPNOTSUPP; goto release_netdev; } cipher_desc = get_cipher_desc(crypto_info->cipher_type); if (!cipher_desc || !cipher_desc->offloadable) { rc = -EINVAL; goto release_netdev; } rc = init_prot_info(prot, crypto_info, cipher_desc); if (rc) goto release_netdev; iv = crypto_info_iv(crypto_info, cipher_desc); rec_seq = crypto_info_rec_seq(crypto_info, cipher_desc); memcpy(ctx->tx.iv + cipher_desc->salt, iv, cipher_desc->iv); memcpy(ctx->tx.rec_seq, rec_seq, cipher_desc->rec_seq); start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL); if (!start_marker_record) { rc = -ENOMEM; goto release_netdev; } offload_ctx = alloc_offload_ctx_tx(ctx); if (!offload_ctx) { rc = -ENOMEM; goto free_marker_record; } rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info); if (rc) goto free_offload_ctx; start_marker_record->end_seq = tcp_sk(sk)->write_seq; start_marker_record->len = 0; start_marker_record->num_frags = 0; list_add_tail(&start_marker_record->list, &offload_ctx->records_list); clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked); ctx->push_pending_record = tls_device_push_pending_record; /* TLS offload is greatly simplified if we don't send * SKBs where only part of the payload needs to be encrypted. * So mark the last skb in the write queue as end of record. */ tcp_write_collapse_fence(sk); /* Avoid offloading if the device is down * We don't want to offload new flows after * the NETDEV_DOWN event * * device_offload_lock is taken in tls_devices's NETDEV_DOWN * handler thus protecting from the device going down before * ctx was added to tls_device_list. 
*/ down_read(&device_offload_lock); if (!(netdev->flags & IFF_UP)) { rc = -EINVAL; goto release_lock; } ctx->priv_ctx_tx = offload_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, &ctx->crypto_send.info, tcp_sk(sk)->write_seq); trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_TX, tcp_sk(sk)->write_seq, rec_seq, rc); if (rc) goto release_lock; tls_device_attach(ctx, sk, netdev); up_read(&device_offload_lock); /* following this assignment tls_is_skb_tx_device_offloaded * will return true and the context might be accessed * by the netdev's xmit function. */ smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb); dev_put(netdev); return 0; release_lock: up_read(&device_offload_lock); clean_acked_data_disable(inet_csk(sk)); crypto_free_aead(offload_ctx->aead_send); free_offload_ctx: kfree(offload_ctx); ctx->priv_ctx_tx = NULL; free_marker_record: kfree(start_marker_record); release_netdev: dev_put(netdev); return rc; } int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) { struct tls12_crypto_info_aes_gcm_128 *info; struct tls_offload_context_rx *context; struct net_device *netdev; int rc = 0; if (ctx->crypto_recv.info.version != TLS_1_2_VERSION) return -EOPNOTSUPP; netdev = get_netdev_for_sock(sk); if (!netdev) { pr_err_ratelimited("%s: netdev not found\n", __func__); return -EINVAL; } if (!(netdev->features & NETIF_F_HW_TLS_RX)) { rc = -EOPNOTSUPP; goto release_netdev; } /* Avoid offloading if the device is down * We don't want to offload new flows after * the NETDEV_DOWN event * * device_offload_lock is taken in tls_devices's NETDEV_DOWN * handler thus protecting from the device going down before * ctx was added to tls_device_list. 
*/ down_read(&device_offload_lock); if (!(netdev->flags & IFF_UP)) { rc = -EINVAL; goto release_lock; } context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) { rc = -ENOMEM; goto release_lock; } context->resync_nh_reset = 1; ctx->priv_ctx_rx = context; rc = tls_set_sw_offload(sk, 0); if (rc) goto release_ctx; rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, &ctx->crypto_recv.info, tcp_sk(sk)->copied_seq); info = (void *)&ctx->crypto_recv.info; trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_RX, tcp_sk(sk)->copied_seq, info->rec_seq, rc); if (rc) goto free_sw_resources; tls_device_attach(ctx, sk, netdev); up_read(&device_offload_lock); dev_put(netdev); return 0; free_sw_resources: up_read(&device_offload_lock); tls_sw_free_resources_rx(sk); down_read(&device_offload_lock); release_ctx: ctx->priv_ctx_rx = NULL; release_lock: up_read(&device_offload_lock); release_netdev: dev_put(netdev); return rc; } void tls_device_offload_cleanup_rx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct net_device *netdev; down_read(&device_offload_lock); netdev = rcu_dereference_protected(tls_ctx->netdev, lockdep_is_held(&device_offload_lock)); if (!netdev) goto out; netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx, TLS_OFFLOAD_CTX_DIR_RX); if (tls_ctx->tx_conf != TLS_HW) { dev_put(netdev); rcu_assign_pointer(tls_ctx->netdev, NULL); } else { set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags); } out: up_read(&device_offload_lock); tls_sw_release_resources_rx(sk); } static int tls_device_down(struct net_device *netdev) { struct tls_context *ctx, *tmp; unsigned long flags; LIST_HEAD(list); /* Request a write lock to block new offload attempts */ down_write(&device_offload_lock); spin_lock_irqsave(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) { struct net_device *ctx_netdev = rcu_dereference_protected(ctx->netdev, lockdep_is_held(&device_offload_lock)); if (ctx_netdev != netdev || 
!refcount_inc_not_zero(&ctx->refcount)) continue; list_move(&ctx->list, &list); } spin_unlock_irqrestore(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &list, list) { /* Stop offloaded TX and switch to the fallback. * tls_is_skb_tx_device_offloaded will return false. */ WRITE_ONCE(ctx->sk->sk_validate_xmit_skb, tls_validate_xmit_skb_sw); /* Stop the RX and TX resync. * tls_dev_resync must not be called after tls_dev_del. */ rcu_assign_pointer(ctx->netdev, NULL); /* Start skipping the RX resync logic completely. */ set_bit(TLS_RX_DEV_DEGRADED, &ctx->flags); /* Sync with inflight packets. After this point: * TX: no non-encrypted packets will be passed to the driver. * RX: resync requests from the driver will be ignored. */ synchronize_net(); /* Release the offload context on the driver side. */ if (ctx->tx_conf == TLS_HW) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); if (ctx->rx_conf == TLS_HW && !test_bit(TLS_RX_DEV_CLOSED, &ctx->flags)) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_RX); dev_put(netdev); /* Move the context to a separate list for two reasons: * 1. When the context is deallocated, list_del is called. * 2. It's no longer an offloaded context, so we don't want to * run offload-specific code on this context. */ spin_lock_irqsave(&tls_device_lock, flags); list_move_tail(&ctx->list, &tls_device_down_list); spin_unlock_irqrestore(&tls_device_lock, flags); /* Device contexts for RX and TX will be freed in on sk_destruct * by tls_device_free_ctx. rx_conf and tx_conf stay in TLS_HW. * Now release the ref taken above. */ if (refcount_dec_and_test(&ctx->refcount)) { /* sk_destruct ran after tls_device_down took a ref, and * it returned early. Complete the destruction here. 
*/ list_del(&ctx->list); tls_device_free_ctx(ctx); } } up_write(&device_offload_lock); flush_workqueue(destruct_wq); return NOTIFY_DONE; } static int tls_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!dev->tlsdev_ops && !(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_FEAT_CHANGE: if (netif_is_bond_master(dev)) return NOTIFY_DONE; if ((dev->features & NETIF_F_HW_TLS_RX) && !dev->tlsdev_ops->tls_dev_resync) return NOTIFY_BAD; if (dev->tlsdev_ops && dev->tlsdev_ops->tls_dev_add && dev->tlsdev_ops->tls_dev_del) return NOTIFY_DONE; else return NOTIFY_BAD; case NETDEV_DOWN: return tls_device_down(dev); } return NOTIFY_DONE; } static struct notifier_block tls_dev_notifier = { .notifier_call = tls_dev_event, }; int __init tls_device_init(void) { int err; dummy_page = alloc_page(GFP_KERNEL); if (!dummy_page) return -ENOMEM; destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0); if (!destruct_wq) { err = -ENOMEM; goto err_free_dummy; } err = register_netdevice_notifier(&tls_dev_notifier); if (err) goto err_destroy_wq; return 0; err_destroy_wq: destroy_workqueue(destruct_wq); err_free_dummy: put_page(dummy_page); return err; } void __exit tls_device_cleanup(void) { unregister_netdevice_notifier(&tls_dev_notifier); destroy_workqueue(destruct_wq); clean_acked_data_flush(); put_page(dummy_page); } |
| 186 1 186 24 24 11 22 2 24 24 17 9 299 341 299 339 300 299 301 186 301 297 203 341 340 341 339 341 341 340 341 340 341 340 339 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 | #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/kernel.h> #include 
<linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>
#include <linux/rmap.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#ifndef CONFIG_MMU_GATHER_NO_GATHER

/*
 * Switch @tlb to another batch so that more pages can be queued.
 *
 * An already-linked next batch is reused; otherwise one page is allocated
 * with GFP_NOWAIT | __GFP_NOWARN and appended to the chain.
 *
 * Returns true when tlb->active now points at a usable batch, false when
 * the caller has to flush instead: delayed rmaps are pending on a
 * non-local batch, MAX_GATHER_BATCH_COUNT has been reached, or the
 * allocation failed.
 */
static bool tlb_next_batch(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	/* Limit batching if we have delayed rmaps pending */
	if (tlb->delayed_rmap && tlb->active != &tlb->local)
		return false;

	batch = tlb->active;
	if (batch->next) {
		tlb->active = batch->next;
		return true;
	}

	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
		return false;

	batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
	if (!batch)
		return false;

	tlb->batch_count++;
	batch->next = NULL;
	batch->nr   = 0;
	batch->max  = MAX_GATHER_BATCH;

	tlb->active->next = batch;
	tlb->active = batch;

	return true;
}

#ifdef CONFIG_SMP
/*
 * Remove the rmap for every entry of @batch that carries
 * ENCODED_PAGE_BIT_DELAY_RMAP.
 */
static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
{
	struct encoded_page **pages = batch->encoded_pages;

	for (int i = 0; i < batch->nr; i++) {
		struct encoded_page *enc = pages[i];

		if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) {
			struct page *page = encoded_page_ptr(enc);
			unsigned int nr_pages = 1;

			/* The page count lives in the following entry, which
			 * is consumed here by the pre-increment.
			 */
			if (unlikely(encoded_page_flags(enc) &
				     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
				nr_pages = encoded_nr_pages(pages[++i]);

			folio_remove_rmap_ptes(page_folio(page), page, nr_pages,
					       vma);
		}
	}
}

/**
 * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
 * @tlb: the current mmu_gather
 * @vma: The memory area from which the pages are being removed.
 *
 * Note that because of how tlb_next_batch() above works, we will
 * never start multiple new batches with pending delayed rmaps, so
 * we only need to walk through the current active batch and the
 * original local one.
*/
void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
	if (!tlb->delayed_rmap)
		return;

	tlb_flush_rmap_batch(&tlb->local, vma);
	if (tlb->active != &tlb->local)
		tlb_flush_rmap_batch(tlb->active, vma);
	tlb->delayed_rmap = 0;
}
#endif

/*
 * We might end up freeing a lot of pages. Reschedule on a regular
 * basis to avoid soft lockups in configurations without full
 * preemption enabled. The magic number of 512 folios seems to work.
 */
#define MAX_NR_FOLIOS_PER_FREE		512

/*
 * Free all encoded pages queued in @batch, in chunks, calling
 * cond_resched() after each chunk.  batch->nr is 0 on return.
 */
static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
{
	struct encoded_page **pages = batch->encoded_pages;
	unsigned int nr, nr_pages;

	while (batch->nr) {
		if (!page_poisoning_enabled_static() && !want_init_on_free()) {
			nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);

			/*
			 * Make sure we cover page + nr_pages, and don't leave
			 * nr_pages behind when capping the number of entries.
			 */
			if (unlikely(encoded_page_flags(pages[nr - 1]) &
				     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
				nr++;
		} else {
			/*
			 * With page poisoning and init_on_free, the time it
			 * takes to free memory grows proportionally with the
			 * actual memory size. Therefore, limit based on the
			 * actual memory size and not the number of involved
			 * folios.
			 */
			for (nr = 0, nr_pages = 0;
			     nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE;
			     nr++) {
				if (unlikely(encoded_page_flags(pages[nr]) &
					     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
					nr_pages += encoded_nr_pages(pages[++nr]);
				else
					nr_pages++;
			}
		}

		free_pages_and_swap_cache(pages, nr);
		pages += nr;
		batch->nr -= nr;

		cond_resched();
	}
}

/*
 * Free the pages of every batch in the chain, stopping at the first
 * batch that holds no entries, and make the local batch active again.
 */
static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch;

	for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
		__tlb_batch_free_encoded_pages(batch);
	tlb->active = &tlb->local;
}

/*
 * Give back the pages backing the extra batches; the embedded local
 * batch is kept and its next pointer reset.
 */
static void tlb_batch_list_free(struct mmu_gather *tlb)
{
	struct mmu_gather_batch *batch, *next;

	for (batch = tlb->local.next; batch; batch = next) {
		next = batch->next;
		free_pages((unsigned long)batch, 0);
	}
	tlb->local.next = NULL;
}

/*
 * Queue @nr_pages pages starting at @page in the active batch.
 *
 * A single page takes one slot.  A multi-page range takes two: the
 * encoded page flagged ENCODED_PAGE_BIT_NR_PAGES_NEXT, followed by the
 * encoded count.
 *
 * Returns true when the batch is full and no further batch could be
 * obtained (the caller must flush), false otherwise.
 */
static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb,
		struct page *page, unsigned int nr_pages, bool delay_rmap,
		int page_size)
{
	int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0;
	struct mmu_gather_batch *batch;

	VM_BUG_ON(!tlb->end);

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
	VM_WARN_ON(tlb->page_size != page_size);
	VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE);
	VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
#endif

	batch = tlb->active;
	/*
	 * Add the page and check if we are full. If so
	 * force a flush.
	 */
	if (likely(nr_pages == 1)) {
		batch->encoded_pages[batch->nr++] = encode_page(page, flags);
	} else {
		flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT;
		batch->encoded_pages[batch->nr++] = encode_page(page, flags);
		batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages);
	}
	/*
	 * Make sure that we can always add another "page" + "nr_pages",
	 * requiring two entries instead of only a single one.
	 */
	if (batch->nr >= batch->max - 1) {
		if (!tlb_next_batch(tlb))
			return true;
		batch = tlb->active;
	}
	VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page);

	return false;
}

/* Queue a range of @nr_pages pages using PAGE_SIZE as the page size. */
bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
		unsigned int nr_pages, bool delay_rmap)
{
	return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap,
					     PAGE_SIZE);
}

/* Queue a single page with a caller-supplied @page_size. */
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
		bool delay_rmap, int page_size)
{
	return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size);
}

#endif /* CONFIG_MMU_GATHER_NO_GATHER */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

/* Free every table recorded in @batch, then the batch page itself. */
static void __tlb_remove_table_free(struct mmu_table_batch *batch)
{
	int i;

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE

/*
 * Semi RCU freeing of the page directories.
 *
 * This is needed by some architectures to implement software pagetable walkers.
 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore needs some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
 *
 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 * since we unlink the page, flush TLBs, free the page. Since the disabling of
 * IRQs delays the completion of the TLB flush we can never observe an already
 * freed page.
 *
 * Architectures that do not have this (PPC) need to delay the freeing by some
 * other means, this is that means.
 *
 * What we do is batch the freed directory pages (tables) and RCU free them.
 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 * holds off grace periods.
 *
 * However, in order to batch these pages we need to allocate storage, this
 * allocation is deep inside the MM code and can thus easily fail on memory
 * pressure.
To guarantee progress we fall back to single table freeing, see
 * the implementation of tlb_remove_table_one().
 *
 */

/* IPI callback with an intentionally empty body. */
static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void tlb_remove_table_sync_one(void)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely on
	 * IRQ disabling.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}

/* RCU callback: free the batch that embeds @head. */
static void tlb_remove_table_rcu(struct rcu_head *head)
{
	__tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
}

/* Defer freeing of @batch to an RCU callback. */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	call_rcu(&batch->rcu, tlb_remove_table_rcu);
}

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/* Without RCU table freeing the batch is freed immediately. */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
	__tlb_remove_table_free(batch);
}

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * If we want tlb_remove_table() to imply TLB invalidates.
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
	if (tlb_needs_table_invalidate()) {
		/*
		 * Invalidate page-table caches used by hardware walkers. Then
		 * we still need to RCU-sched wait while freeing the pages
		 * because software walkers can still be in-flight.
		 */
		tlb_flush_mmu_tlbonly(tlb);
	}
}

/*
 * Unbatched fallback: synchronise via tlb_remove_table_sync_one(), then
 * free the single @table directly.
 */
static void tlb_remove_table_one(void *table)
{
	tlb_remove_table_sync_one();
	__tlb_remove_table(table);
}

/* Invalidate, then hand the pending table batch (if any) over for freeing. */
static void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		tlb_table_invalidate(tlb);
		tlb_remove_table_free(*batch);
		*batch = NULL;
	}
}

/*
 * Queue @table for freeing.
 *
 * The batch page is allocated on demand; if that allocation fails the
 * table is freed on the spot via tlb_remove_table_one().  A batch that
 * reaches MAX_TABLE_BATCH entries is flushed right away.
 */
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			tlb_table_invalidate(tlb);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}

	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}

static inline void tlb_table_init(struct mmu_gather *tlb)
{
	tlb->batch = NULL;
}

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

/* No table batching configured: both hooks are no-ops. */
static inline void tlb_table_flush(struct mmu_gather *tlb) { }
static inline void tlb_table_init(struct mmu_gather *tlb) { }

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

/* Free what has been gathered: page tables first, then the page batches. */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
	tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb_batch_pages_flush(tlb);
#endif
}

/* Flush the TLB first, and only then free the gathered memory. */
void tlb_flush_mmu(struct mmu_gather *tlb)
{
	tlb_flush_mmu_tlbonly(tlb);
	tlb_flush_mmu_free(tlb);
}

/*
 * Common initialisation behind tlb_gather_mmu() and
 * tlb_gather_mmu_fullmm(): record @mm and @fullmm, reset the batching
 * state and the flush range, and bump the mm's pending-flush count.
 */
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
			     bool fullmm)
{
	tlb->mm = mm;
	tlb->fullmm = fullmm;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb->need_flush_all = 0;
	tlb->local.next = NULL;
	tlb->local.nr   = 0;
	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
	tlb->active     = &tlb->local;
	tlb->batch_count = 0;
#endif
	tlb->delayed_rmap = 0;

	tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
	tlb->page_size = 0;
#endif

	__tlb_reset_range(tlb);
	inc_tlb_flush_pending(tlb->mm);
}

/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Called to initialize an
(on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, false);
}

/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * In this case, @mm is without users and we're going to destroy the
 * full address space (exit/execve).
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
	__tlb_gather_mmu(tlb, mm, true);
}

/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb)
{
	/*
	 * If there are parallel threads doing PTE changes on the same range
	 * under a non-exclusive lock (e.g., mmap_lock read-side) but deferring
	 * the TLB flush by batching, one thread may end up seeing inconsistent
	 * PTEs and be left with stale TLB entries. So flush the TLB forcefully
	 * if we detect parallel PTE batching threads.
	 *
	 * However, some syscalls, e.g. munmap(), may free page tables, which
	 * requires a forced flush of everything in the given range. Otherwise
	 * stale TLB entries may remain on architectures, e.g. aarch64, that
	 * can specify which TLB level to flush.
	 */
	if (mm_tlb_flush_nested(tlb->mm)) {
		/*
		 * The aarch64 yields better performance with fullmm by
		 * avoiding multiple CPUs spamming TLBI messages at the
		 * same time.
		 *
		 * On x86 non-fullmm doesn't yield significant difference
		 * against fullmm.
		 */
		tlb->fullmm = 1;
		__tlb_reset_range(tlb);
		tlb->freed_tables = 1;
	}

	tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
	tlb_batch_list_free(tlb);
#endif
	/* Pairs with the inc_tlb_flush_pending() done at gather time. */
	dec_tlb_flush_pending(tlb->mm);
}
|
| 29 15 20 20 20 2 1 2 2 12 7 1 1 2 4 2 1 1 2 2 2 2 3 17 8 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 | // SPDX-License-Identifier: GPL-2.0 #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/ktime.h> #include <linux/seq_file.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/nsfs.h> #include <linux/uaccess.h> #include "mount.h" #include "internal.h" static struct vfsmount *nsfs_mnt; static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); static const struct file_operations ns_file_operations = { .llseek = no_llseek, .unlocked_ioctl = ns_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { struct 
inode *inode = d_inode(dentry); struct ns_common *ns = inode->i_private; const struct proc_ns_operations *ns_ops = ns->ops; return dynamic_dname(buffer, buflen, "%s:[%lu]", ns_ops->name, inode->i_ino); } const struct dentry_operations ns_dentry_operations = { .d_delete = always_delete_dentry, .d_dname = ns_dname, .d_prune = stashed_dentry_prune, }; static void nsfs_evict(struct inode *inode) { struct ns_common *ns = inode->i_private; clear_inode(inode); ns->ops->put(ns); } int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb, void *private_data) { struct ns_common *ns; ns = ns_get_cb(private_data); if (!ns) return -ENOENT; return path_from_stashed(&ns->stashed, nsfs_mnt, ns, path); } struct ns_get_path_task_args { const struct proc_ns_operations *ns_ops; struct task_struct *task; }; static struct ns_common *ns_get_path_task(void *private_data) { struct ns_get_path_task_args *args = private_data; return args->ns_ops->get(args->task); } int ns_get_path(struct path *path, struct task_struct *task, const struct proc_ns_operations *ns_ops) { struct ns_get_path_task_args args = { .ns_ops = ns_ops, .task = task, }; return ns_get_path_cb(path, ns_get_path_task, &args); } /** * open_namespace - open a namespace * @ns: the namespace to open * * This will consume a reference to @ns indendent of success or failure. * * Return: A file descriptor on success or a negative error code on failure. 
*/
int open_namespace(struct ns_common *ns)
{
	/* path_put() runs automatically on every return below */
	struct path path __free(path_put) = {};
	struct file *f;
	int err;

	/* call first to consume reference */
	err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
	if (err < 0)
		return err;

	CLASS(get_unused_fd, fd)(O_CLOEXEC);
	if (fd < 0)
		return fd;

	f = dentry_open(&path, O_RDONLY, current_cred());
	if (IS_ERR(f))
		return PTR_ERR(f);

	fd_install(fd, f);
	return take_fd(fd);
}

/*
 * Open the namespace that @get_ns derives from @ns.
 *
 * Return: the error carried by @get_ns's result if it is an error
 * pointer, otherwise whatever open_namespace() returns.
 */
int open_related_ns(struct ns_common *ns,
		   struct ns_common *(*get_ns)(struct ns_common *ns))
{
	struct ns_common *relative;

	relative = get_ns(ns);
	if (IS_ERR(relative))
		return PTR_ERR(relative);

	return open_namespace(relative);
}
EXPORT_SYMBOL_GPL(open_related_ns);

/*
 * ioctl handler for nsfs files.
 *
 * Return: a new fd for NS_GET_USERNS / NS_GET_PARENT, the namespace type
 * for NS_GET_NSTYPE, the put_user() result for NS_GET_OWNER_UID and
 * NS_GET_MNTNS_ID, a pid/tgid for the four pid translation ioctls,
 * -EINVAL when the request does not match the namespace type (or no
 * get_parent op exists), -ESRCH when no task is found or the translated
 * id is 0, and -ENOTTY for unknown requests.
 */
static long ns_ioctl(struct file *filp, unsigned int ioctl,
			unsigned long arg)
{
	struct user_namespace *user_ns;
	struct pid_namespace *pid_ns;
	struct task_struct *tsk;
	struct ns_common *ns = get_proc_ns(file_inode(filp));
	uid_t __user *argp;
	uid_t uid;
	int ret;

	switch (ioctl) {
	case NS_GET_USERNS:
		return open_related_ns(ns, ns_get_owner);
	case NS_GET_PARENT:
		if (!ns->ops->get_parent)
			return -EINVAL;
		return open_related_ns(ns, ns->ops->get_parent);
	case NS_GET_NSTYPE:
		return ns->ops->type;
	case NS_GET_OWNER_UID:
		if (ns->ops->type != CLONE_NEWUSER)
			return -EINVAL;
		user_ns = container_of(ns, struct user_namespace, ns);
		argp = (uid_t __user *) arg;
		uid = from_kuid_munged(current_user_ns(), user_ns->owner);
		return put_user(uid, argp);
	case NS_GET_MNTNS_ID: {
		struct mnt_namespace *mnt_ns;
		__u64 __user *idp;
		__u64 id;

		if (ns->ops->type != CLONE_NEWNS)
			return -EINVAL;

		mnt_ns = container_of(ns, struct mnt_namespace, ns);
		idp = (__u64 __user *)arg;
		id = mnt_ns->seq;
		return put_user(id, idp);
	}
	case NS_GET_PID_FROM_PIDNS:
		fallthrough;
	case NS_GET_TGID_FROM_PIDNS:
		fallthrough;
	case NS_GET_PID_IN_PIDNS:
		fallthrough;
	case NS_GET_TGID_IN_PIDNS: {
		if (ns->ops->type != CLONE_NEWPID)
			return -EINVAL;

		ret = -ESRCH;
		pid_ns = container_of(ns, struct pid_namespace, ns);

		guard(rcu)();

		/*
		 * *_IN_PIDNS: @arg is looked up with find_task_by_vpid().
		 * *_FROM_PIDNS: @arg is looked up inside @pid_ns.
		 */
		if (ioctl == NS_GET_PID_IN_PIDNS ||
		    ioctl == NS_GET_TGID_IN_PIDNS)
			tsk = find_task_by_vpid(arg);
		else
			tsk = find_task_by_pid_ns(arg, pid_ns);
		if (!tsk)
			break;

		switch (ioctl) {
		case NS_GET_PID_FROM_PIDNS:
			ret = task_pid_vnr(tsk);
			break;
		case NS_GET_TGID_FROM_PIDNS:
			ret = task_tgid_vnr(tsk);
			break;
		case NS_GET_PID_IN_PIDNS:
			ret = task_pid_nr_ns(tsk, pid_ns);
			break;
		case NS_GET_TGID_IN_PIDNS:
			ret = task_tgid_nr_ns(tsk, pid_ns);
			break;
		default:
			ret = 0;
			break;
		}

		/* a translated id of 0 is reported as "no such task" */
		if (!ret)
			ret = -ESRCH;
		break;
	}
	default:
		ret = -ENOTTY;
	}

	return ret;
}

/*
 * Format "<name>:[<inum>]" for the namespace of @task selected by @ns_ops
 * into @buf, preferring ns_ops->real_ns_name over ns_ops->name when set.
 *
 * Return: the snprintf() result, or -ENOENT when ns_ops->get() yields no
 * namespace.
 */
int ns_get_name(char *buf, size_t size, struct task_struct *task,
			const struct proc_ns_operations *ns_ops)
{
	struct ns_common *ns;
	int res = -ENOENT;
	const char *name;
	ns = ns_ops->get(task);
	if (ns) {
		name = ns_ops->real_ns_name ? : ns_ops->name;
		res = snprintf(buf, size, "%s:[%u]", name, ns->inum);
		ns_ops->put(ns);
	}
	return res;
}

/* True when @file uses the nsfs file operations. */
bool proc_ns_file(const struct file *file)
{
	return file->f_op == &ns_file_operations;
}

/**
 * ns_match() - Returns true if current namespace matches dev/ino provided.
 * @ns: current namespace
 * @dev: dev_t from nsfs that will be matched against current nsfs
 * @ino: ino_t from nsfs that will be matched against current nsfs
 *
 * Return: true if dev and ino matches the current nsfs.
*/ bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino) { return (ns->inum == ino) && (nsfs_mnt->mnt_sb->s_dev == dev); } static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) { struct inode *inode = d_inode(dentry); const struct ns_common *ns = inode->i_private; const struct proc_ns_operations *ns_ops = ns->ops; seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); return 0; } static const struct super_operations nsfs_ops = { .statfs = simple_statfs, .evict_inode = nsfs_evict, .show_path = nsfs_show_path, }; static int nsfs_init_inode(struct inode *inode, void *data) { struct ns_common *ns = data; inode->i_private = data; inode->i_mode |= S_IRUGO; inode->i_fop = &ns_file_operations; inode->i_ino = ns->inum; return 0; } static void nsfs_put_data(void *data) { struct ns_common *ns = data; ns->ops->put(ns); } static const struct stashed_operations nsfs_stashed_ops = { .init_inode = nsfs_init_inode, .put_data = nsfs_put_data, }; static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &nsfs_ops; ctx->dops = &ns_dentry_operations; fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; } static struct file_system_type nsfs = { .name = "nsfs", .init_fs_context = nsfs_init_fs_context, .kill_sb = kill_anon_super, }; void __init nsfs_init(void) { nsfs_mnt = kern_mount(&nsfs); if (IS_ERR(nsfs_mnt)) panic("can't set nsfs up\n"); nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; } |
| 200 2 2 1 2 2 125 113 5 1 8 1 13 125 125 125 125 125 244 245 200 124 125 198 200 200 2 6 7 7 123 125 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 
501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 
1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 
1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 
1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 
2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 
2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 
3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_attr_sf.h" #include "xfs_attr.h" #include "xfs_attr_remote.h" #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_dir2.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_errortag.h" #include "xfs_health.h" /* * xfs_attr_leaf.c * * Routines to implement leaf blocks of attributes as Btrees of hashed names. */ /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ /* * Routines used for growing the Btree. */ STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block, struct xfs_buf **bpp); STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, struct xfs_attr3_icleaf_hdr *ichdr, struct xfs_da_args *args, int freemap_index); STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args, struct xfs_attr3_icleaf_hdr *ichdr, struct xfs_buf *leaf_buffer); STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state, xfs_da_state_blk_t *leaf_blk_1, struct xfs_attr3_icleaf_hdr *ichdr1, xfs_da_state_blk_t *leaf_blk_2, struct xfs_attr3_icleaf_hdr *ichdr2, int *number_entries_in_blk1, int *number_usedbytes_in_blk1); /* * Utility routines. 
*/
STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
			struct xfs_attr_leafblock *src_leaf,
			struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
			struct xfs_attr_leafblock *dst_leaf,
			struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
			int move_count);
STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);

/*
 * attr3 block 'firstused' conversion helpers.
 *
 * firstused refers to the offset of the first used byte of the nameval region
 * of an attr leaf block. The region starts at the tail of the block and expands
 * backwards towards the middle. As such, firstused is initialized to the block
 * size for an empty leaf block and is reduced from there.
 *
 * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k.
 * The in-core firstused field is 32-bit and thus supports the maximum fsb size.
 * The on-disk field is only 16-bit, however, and overflows at 64k. Since this
 * only occurs at exactly 64k, we use zero as a magic on-disk value to represent
 * the attr block size. The following helpers manage the conversion between the
 * in-core and on-disk formats.
 */

/*
 * Load the on-disk firstused field into the in-core header, picking the v3 or
 * the older header layout by the block's magic number.
 */
static void
xfs_attr3_leaf_firstused_from_disk(
	struct xfs_da_geometry		*geo,
	struct xfs_attr3_icleaf_hdr	*to,
	struct xfs_attr_leafblock	*from)
{
	struct xfs_attr3_leaf_hdr	*hdr3;

	if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
		hdr3 = (struct xfs_attr3_leaf_hdr *) from;
		to->firstused = be16_to_cpu(hdr3->firstused);
	} else {
		to->firstused = be16_to_cpu(from->hdr.firstused);
	}

	/*
	 * Convert from the magic fsb size value to actual blocksize. This
	 * should only occur for empty blocks when the block size overflows
	 * 16-bits.
	 */
	if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) {
		ASSERT(!to->count && !to->usedbytes);
		ASSERT(geo->blksize > USHRT_MAX);
		to->firstused = geo->blksize;
	}
}

/*
 * Store the in-core firstused value into the on-disk header, substituting the
 * magic XFS_ATTR3_LEAF_NULLOFF value when it does not fit in 16 bits.
 */
static void
xfs_attr3_leaf_firstused_to_disk(
	struct xfs_da_geometry		*geo,
	struct xfs_attr_leafblock	*to,
	struct xfs_attr3_icleaf_hdr	*from)
{
	struct xfs_attr3_leaf_hdr	*hdr3;
	uint32_t			firstused;

	/* magic value should only be seen on disk */
	ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF);

	/*
	 * Scale down the 32-bit in-core firstused value to the 16-bit on-disk
	 * value. This only overflows at the max supported value of 64k. Use the
	 * magic on-disk value to represent block size in this case.
	 */
	firstused = from->firstused;
	if (firstused > USHRT_MAX) {
		ASSERT(from->firstused == geo->blksize);
		firstused = XFS_ATTR3_LEAF_NULLOFF;
	}

	if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
		hdr3 = (struct xfs_attr3_leaf_hdr *) to;
		hdr3->firstused = cpu_to_be16(firstused);
	} else {
		to->hdr.firstused = cpu_to_be16(firstused);
	}
}

/*
 * Decode an on-disk attr leaf header into the CPU-endian in-core header.
 *
 * The block's magic number selects which of the two on-disk header layouts is
 * read; the same set of in-core fields is filled in either way.
 */
void
xfs_attr3_leaf_hdr_from_disk(
	struct xfs_da_geometry		*geo,
	struct xfs_attr3_icleaf_hdr	*to,
	struct xfs_attr_leafblock	*from)
{
	int	i;

	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
	       from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));

	if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
		struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;

		to->forw = be32_to_cpu(hdr3->info.hdr.forw);
		to->back = be32_to_cpu(hdr3->info.hdr.back);
		to->magic = be16_to_cpu(hdr3->info.hdr.magic);
		to->count = be16_to_cpu(hdr3->count);
		to->usedbytes = be16_to_cpu(hdr3->usedbytes);
		/* reads to->count/usedbytes in its ASSERT, so must come after */
		xfs_attr3_leaf_firstused_from_disk(geo, to, from);
		to->holes = hdr3->holes;

		for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
			to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
			to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
		}
		return;
	}
	to->forw = be32_to_cpu(from->hdr.info.forw);
	to->back = be32_to_cpu(from->hdr.info.back);
	to->magic = be16_to_cpu(from->hdr.info.magic);
	to->count = be16_to_cpu(from->hdr.count);
	to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
	/* reads to->count/usedbytes in its ASSERT, so must come after */
	xfs_attr3_leaf_firstused_from_disk(geo, to, from);
	to->holes = from->hdr.holes;

	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
		to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
		to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
	}
}

/*
 * Encode the in-core attr leaf header into the on-disk (big-endian) header.
 *
 * from->magic selects which of the two on-disk layouts is written.  The pad1
 * field is explicitly zeroed in both layouts.
 */
void
xfs_attr3_leaf_hdr_to_disk(
	struct xfs_da_geometry		*geo,
	struct xfs_attr_leafblock	*to,
	struct xfs_attr3_icleaf_hdr	*from)
{
	int				i;

	ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
	       from->magic == XFS_ATTR3_LEAF_MAGIC);

	if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
		struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;

		hdr3->info.hdr.forw = cpu_to_be32(from->forw);
		hdr3->info.hdr.back = cpu_to_be32(from->back);
		hdr3->info.hdr.magic = cpu_to_be16(from->magic);
		hdr3->count = cpu_to_be16(from->count);
		hdr3->usedbytes = cpu_to_be16(from->usedbytes);
		xfs_attr3_leaf_firstused_to_disk(geo, to, from);
		hdr3->holes = from->holes;
		hdr3->pad1 = 0;

		for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
			hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base);
			hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size);
		}
		return;
	}
	to->hdr.info.forw = cpu_to_be32(from->forw);
	to->hdr.info.back = cpu_to_be32(from->back);
	to->hdr.info.magic = cpu_to_be16(from->magic);
	to->hdr.count = cpu_to_be16(from->count);
	to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
	xfs_attr3_leaf_firstused_to_disk(geo, to, from);
	to->hdr.holes = from->holes;
	to->hdr.pad1 = 0;

	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
		to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base);
		to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size);
	}
}

/*
 * Check a single leaf entry and the name structure it points at.
 *
 * *last_hashval carries the previous entry's hash between calls and is updated
 * here, so callers must visit entries in index order.  Returns NULL if the
 * entry passes every check, otherwise the address of the failing check.
 */
static xfs_failaddr_t
xfs_attr3_leaf_verify_entry(
	struct xfs_mount			*mp,
	char					*buf_end,
	struct xfs_attr_leafblock		*leaf,
	struct xfs_attr3_icleaf_hdr		*leafhdr,
	struct xfs_attr_leaf_entry		*ent,
	int					idx,
	__u32					*last_hashval)
{
	struct xfs_attr_leaf_name_local		*lentry;
	struct xfs_attr_leaf_name_remote	*rentry;
	char					*name_end;
	unsigned int				nameidx;
	unsigned int				namesize;
	__u32					hashval;

	/* hash order check */
	hashval = be32_to_cpu(ent->hashval);
	if (hashval < *last_hashval)
		return __this_address;
	*last_hashval = hashval;

	/* the name must start inside the used region of the block */
	nameidx = be16_to_cpu(ent->nameidx);
	if (nameidx < leafhdr->firstused || nameidx >= mp->m_attr_geo->blksize)
		return __this_address;

	/*
	 * Check the name information.  The namelen fields are u8 so we can't
	 * possibly exceed the maximum name length of 255 bytes.
	 */
	if (ent->flags & XFS_ATTR_LOCAL) {
		lentry = xfs_attr3_leaf_name_local(leaf, idx);
		namesize = xfs_attr_leaf_entsize_local(lentry->namelen,
				be16_to_cpu(lentry->valuelen));
		name_end = (char *)lentry + namesize;
		if (lentry->namelen == 0)
			return __this_address;
	} else {
		rentry = xfs_attr3_leaf_name_remote(leaf, idx);
		namesize = xfs_attr_leaf_entsize_remote(rentry->namelen);
		name_end = (char *)rentry + namesize;
		if (rentry->namelen == 0)
			return __this_address;
		/* a remote entry with no value block must be marked INCOMPLETE */
		if (!(ent->flags & XFS_ATTR_INCOMPLETE) &&
		    rentry->valueblk == 0)
			return __this_address;
	}

	/* the name structure must not run off the end of the buffer */
	if (name_end > buf_end)
		return __this_address;

	return NULL;
}

/*
 * Validate an attribute leaf block.
 *
 * Empty leaf blocks can occur under the following circumstances:
 *
 * 1. setxattr adds a new extended attribute to a file;
 * 2. The file has zero existing attributes;
 * 3. The attribute is too large to fit in the attribute fork;
 * 4. The attribute is small enough to fit in a leaf block;
 * 5. A log flush occurs after committing the transaction that creates
 *    the (empty) leaf block; and
 * 6. The filesystem goes down after the log flush but before the new
 *    attribute can be committed to the leaf block.
 *
 * Hence we need to ensure that we don't fail the validation purely
 * because the leaf is empty.
*/
static xfs_failaddr_t
xfs_attr3_leaf_verify(
	struct xfs_buf			*bp)
{
	struct xfs_attr3_icleaf_hdr	ichdr;
	struct xfs_mount		*mp = bp->b_mount;
	struct xfs_attr_leafblock	*leaf = bp->b_addr;
	struct xfs_attr_leaf_entry	*entries;
	struct xfs_attr_leaf_entry	*ent;
	char				*buf_end;
	uint32_t			end;	/* must be 32bit - see below */
	__u32				last_hashval = 0;
	int				i;
	xfs_failaddr_t			fa;

	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);

	fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
	if (fa)
		return fa;

	/*
	 * firstused is the block offset of the first name info structure.
	 * Make sure it doesn't go off the block or crash into the header.
	 */
	if (ichdr.firstused > mp->m_attr_geo->blksize)
		return __this_address;
	if (ichdr.firstused < xfs_attr3_leaf_hdr_size(leaf))
		return __this_address;

	/* Make sure the entries array doesn't crash into the name info. */
	entries = xfs_attr3_leaf_entryp(bp->b_addr);
	if ((char *)&entries[ichdr.count] >
	    (char *)bp->b_addr + ichdr.firstused)
		return __this_address;

	/*
	 * NOTE: This verifier historically failed empty leaf buffers because
	 * we expect the fork to be in another format. Empty attr fork format
	 * conversions are possible during xattr set, however, and format
	 * conversion is not atomic with the xattr set that triggers it. We
	 * cannot assume leaf blocks are non-empty until that is addressed.
	 */
	buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
	for (i = 0, ent = entries; i < ichdr.count; ent++, i++) {
		fa = xfs_attr3_leaf_verify_entry(mp, buf_end, leaf, &ichdr,
				ent, i, &last_hashval);
		if (fa)
			return fa;
	}

	/*
	 * Quickly check the freemap information.  Attribute data has to be
	 * aligned to 4-byte boundaries, and likewise for the free space.
	 *
	 * Note that for 64k block size filesystems, the freemap entries cannot
	 * overflow as they are only be16 fields. However, when checking end
	 * pointer of the freemap, we have to be careful to detect overflows and
	 * so use uint32_t for those checks.
	 */
	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
		if (ichdr.freemap[i].base > mp->m_attr_geo->blksize)
			return __this_address;
		if (ichdr.freemap[i].base & 0x3)
			return __this_address;
		if (ichdr.freemap[i].size > mp->m_attr_geo->blksize)
			return __this_address;
		if (ichdr.freemap[i].size & 0x3)
			return __this_address;

		/* be careful of 16 bit overflows here */
		end = (uint32_t)ichdr.freemap[i].base + ichdr.freemap[i].size;
		if (end < ichdr.freemap[i].base)
			return __this_address;
		if (end > mp->m_attr_geo->blksize)
			return __this_address;
	}

	return NULL;
}

/*
 * Check the magic number and owner recorded in a leaf block header.
 *
 * The checks are only made when the filesystem has CRCs enabled; otherwise
 * the block is accepted unconditionally.  Returns NULL on success or the
 * address of the failing check.
 */
xfs_failaddr_t
xfs_attr3_leaf_header_check(
	struct xfs_buf		*bp,
	xfs_ino_t		owner)
{
	struct xfs_mount	*mp = bp->b_mount;

	if (xfs_has_crc(mp)) {
		struct xfs_attr3_leafblock *hdr3 = bp->b_addr;

		if (hdr3->hdr.info.hdr.magic !=
				cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
			return __this_address;

		if (be64_to_cpu(hdr3->hdr.info.owner) != owner)
			return __this_address;
	}

	return NULL;
}

/*
 * Write verifier: run the structure checks, then on CRC-enabled filesystems
 * stamp the LSN (when the buffer has a log item) and refresh the checksum.
 */
static void
xfs_attr3_leaf_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
	xfs_failaddr_t		fa;

	fa = xfs_attr3_leaf_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	/* no LSN or checksum to maintain without CRC support */
	if (!xfs_has_crc(mp))
		return;

	if (bip)
		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);

	xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
}

/*
 * leaf/node format detection on trees is sketchy, so a node read can be done on
 * leaf level blocks when detection identifies the tree as a node format tree
 * incorrectly. In this case, we need to swap the verifier to match the correct
 * format of the block being read.
*/ static void xfs_attr3_leaf_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_attr3_leaf_verify(bp); if (fa) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { .name = "xfs_attr3_leaf", .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC), cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) }, .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, .verify_struct = xfs_attr3_leaf_verify, }; int xfs_attr3_leaf_read( struct xfs_trans *tp, struct xfs_inode *dp, xfs_ino_t owner, xfs_dablk_t bno, struct xfs_buf **bpp) { xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); if (err || !(*bpp)) return err; fa = xfs_attr3_leaf_header_check(*bpp, owner); if (fa) { __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } if (tp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); return 0; } /*======================================================================== * Namespace helper routines *========================================================================*/ /* * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE * flag on disk - if there's an incomplete attr then recovery needs to tear it * down. If there's no incomplete attr, then recovery needs to tear that attr * down to replace it with the attr that has been logged. In this case, the * INCOMPLETE flag will not be set in attr->attr_filter, but rather * XFS_DA_OP_RECOVERY will be set in args->op_flags. 
*/
static inline unsigned int xfs_attr_match_mask(const struct xfs_da_args *args)
{
	if (args->op_flags & XFS_DA_OP_RECOVERY)
		return XFS_ATTR_NSP_ONDISK_MASK;
	return XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE;
}

/*
 * Compare a candidate attr value against args->value for a parent pointer
 * lookup.  Only an exact length and byte-for-byte match succeeds.
 */
static inline bool
xfs_attr_parent_match(
	const struct xfs_da_args	*args,
	const void			*value,
	unsigned int			valuelen)
{
	ASSERT(args->value != NULL);

	/* Parent pointers do not use remote values */
	if (!value)
		return false;

	/*
	 * The only value we support is a parent rec.  However, we'll accept
	 * any valuelen so that offline repair can delete ATTR_PARENT values
	 * that are not parent pointers.
	 */
	if (valuelen != args->valuelen)
		return false;

	return memcmp(args->value, value, valuelen) == 0;
}

/*
 * Decide whether an attr described by (attr_flags, name, namelen, value,
 * valuelen) is the one being looked up in @args.
 *
 * Name length, the masked flag bits and the name bytes must all agree.  For
 * entries flagged XFS_ATTR_PARENT the value must match as well.
 */
static bool
xfs_attr_match(
	struct xfs_da_args	*args,
	unsigned int		attr_flags,
	const unsigned char	*name,
	unsigned int		namelen,
	const void		*value,
	unsigned int		valuelen)
{
	unsigned int		mask = xfs_attr_match_mask(args);

	/* cheap checks first; the name bytes are only compared if they pass */
	if (args->namelen != namelen)
		return false;
	if ((args->attr_filter & mask) != (attr_flags & mask))
		return false;
	if (memcmp(args->name, name, namelen) != 0)
		return false;

	if (attr_flags & XFS_ATTR_PARENT)
		return xfs_attr_parent_match(args, value, valuelen);

	return true;
}

/*
 * Hand an attr value of @valuelen bytes back to the caller through @args.
 *
 * Returns 0 on success (including the "length only" and parent pointer cases
 * where nothing is copied), -ERANGE if the caller's buffer is too small (with
 * args->valuelen set to the required size), -ENOMEM if a buffer had to be
 * allocated and the allocation failed, -EINVAL if a local copy was requested
 * with a NULL @value, or the result of xfs_attr_rmtval_get() for remote values.
 */
static int
xfs_attr_copy_value(
	struct xfs_da_args	*args,
	unsigned char		*value,
	int			valuelen)
{
	/*
	 * Parent pointer lookups require the caller to specify the name and
	 * value, so don't copy anything.
	 */
	if (args->attr_filter & XFS_ATTR_PARENT)
		return 0;

	/*
	 * No copy if all we have to do is get the length
	 */
	if (!args->valuelen) {
		args->valuelen = valuelen;
		return 0;
	}

	/*
	 * No copy if the length of the existing buffer is too small
	 */
	if (args->valuelen < valuelen) {
		args->valuelen = valuelen;
		return -ERANGE;
	}

	/* allocate a buffer on the caller's behalf if none was supplied */
	if (!args->value) {
		args->value = kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP);
		if (!args->value)
			return -ENOMEM;
	}
	args->valuelen = valuelen;

	/* remote block xattr requires IO for copy-in */
	if (args->rmtblkno)
		return xfs_attr_rmtval_get(args);

	/*
	 * This is to prevent a GCC warning because the remote xattr case
	 * doesn't have a value to pass in. In that case, we never reach here,
	 * but GCC can't work that out and so throws a "passing NULL to
	 * memcpy" warning.
	 */
	if (!value)
		return -EINVAL;
	memcpy(args->value, value, valuelen);
	return 0;
}

/*========================================================================
 * External routines when attribute fork size < XFS_LITINO(mp).
 *========================================================================*/

/*
 * Query whether the total requested number of attr fork bytes of extended
 * attribute space will be able to fit inline.
 *
 * Returns zero if not, else the i_forkoff fork offset to be used in the
 * literal area for attribute data once the new bytes have been added.
 *
 * i_forkoff must be 8 byte aligned, hence is stored as a >>3 value;
 * special case for dev/uuid inodes, they have fixed size data forks.
 */
int
xfs_attr_shortform_bytesfit(
	struct xfs_inode	*dp,
	int			bytes)
{
	struct xfs_mount	*mp = dp->i_mount;
	int64_t			dsize;
	int			minforkoff;
	int			maxforkoff;
	int			offset;

	/*
	 * Check if the new size could fit at all first:
	 */
	if (bytes > XFS_LITINO(mp))
		return 0;

	/* rounded down */
	offset = (XFS_LITINO(mp) - bytes) >> 3;

	if (dp->i_df.if_format == XFS_DINODE_FMT_DEV) {
		minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
		return (offset >= minforkoff) ? minforkoff : 0;
	}

	/*
	 * If the requested numbers of bytes is smaller or equal to the
	 * current attribute fork size we can always proceed.
	 *
	 * Note that if_bytes in the data fork might actually be larger than
	 * the current data fork size due to delalloc extents. In that
	 * case either the extent count will go down when they are converted
	 * to real extents, or the delalloc conversion will take care of the
	 * literal area rebalancing.
	 */
	if (bytes <= xfs_inode_attr_fork_size(dp))
		return dp->i_forkoff;

	/*
	 * For attr2 we can try to move the forkoff if there is space in the
	 * literal area, but for the old format we are done if there is no
	 * space in the fixed attribute fork.
	 */
	if (!xfs_has_attr2(mp))
		return 0;

	dsize = dp->i_df.if_bytes;

	switch (dp->i_df.if_format) {
	case XFS_DINODE_FMT_EXTENTS:
		/*
		 * If there is no attr fork and the data fork is extents,
		 * determine if creating the default attr fork will result
		 * in the extents form migrating to btree. If so, the
		 * minimum offset only needs to be the space required for
		 * the btree root.
		 */
		if (!dp->i_forkoff && dp->i_df.if_bytes >
		    xfs_default_attroffset(dp))
			dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
		break;
	case XFS_DINODE_FMT_BTREE:
		/*
		 * If we have a data btree then keep forkoff if we have one,
		 * otherwise we are adding a new attr, so then we set
		 * minforkoff to where the btree root can finish so we have
		 * plenty of room for attrs
		 */
		if (dp->i_forkoff) {
			if (offset < dp->i_forkoff)
				return 0;
			return dp->i_forkoff;
		}
		dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
		break;
	}

	/*
	 * A data fork btree root must have space for at least
	 * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
	 */
	minforkoff = max_t(int64_t, dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
	minforkoff = roundup(minforkoff, 8) >> 3;

	/* attr fork btree root can have at least this many key/ptr pairs */
	maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
	maxforkoff = maxforkoff >> 3;	/* rounded down */

	/* clamp the candidate offset into [minforkoff, maxforkoff] or give up */
	if (offset >= maxforkoff)
		return maxforkoff;
	if (offset >= minforkoff)
		return offset;
	return 0;
}

/*
 * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless:
 * - noattr2 mount option is set,
 * - on-disk version bit says it is already set, or
 * - the attr2 mount option is not set to enable automatic upgrade from attr1.
 */
STATIC void
xfs_sbversion_add_attr2(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp)
{
	if (xfs_has_noattr2(mp))
		return;
	if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
		return;
	if (!xfs_has_attr2(mp))
		return;

	/* the feature update is made under m_sb_lock, then the sb is logged */
	spin_lock(&mp->m_sb_lock);
	xfs_add_attr2(mp);
	spin_unlock(&mp->m_sb_lock);
	xfs_log_sb(tp);
}

/*
 * Create the initial contents of a shortform attribute list.
 */
void
xfs_attr_shortform_create(
	struct xfs_da_args	*args)
{
	struct xfs_inode	*dp = args->dp;
	struct xfs_ifork	*ifp = &dp->i_af;
	struct xfs_attr_sf_hdr	*hdr;

	trace_xfs_attr_sf_create(args);

	ASSERT(ifp->if_bytes == 0);
	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS)
		ifp->if_format = XFS_DINODE_FMT_LOCAL;

	/* an empty list is just a zeroed header whose totsize covers itself */
	hdr = xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
	memset(hdr, 0, sizeof(*hdr));
	hdr->totsize = cpu_to_be16(sizeof(*hdr));
	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
}

/*
 * Return the entry if the attr in args is found, or NULL if not.
*/ struct xfs_attr_sf_entry * xfs_attr_sf_findname( struct xfs_da_args *args) { struct xfs_attr_sf_hdr *sf = args->dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; for (sfe = xfs_attr_sf_firstentry(sf); sfe < xfs_attr_sf_endptr(sf); sfe = xfs_attr_sf_nextentry(sfe)) { if (xfs_attr_match(args, sfe->flags, sfe->nameval, sfe->namelen, &sfe->nameval[sfe->namelen], sfe->valuelen)) return sfe; } return NULL; } /* * Add a name/value pair to the shortform attribute list. * Overflow from the inode has already been checked for. */ void xfs_attr_shortform_add( struct xfs_da_args *args, int forkoff) { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_ifork *ifp = &dp->i_af; struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; int size; trace_xfs_attr_sf_add(args); dp->i_forkoff = forkoff; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(!xfs_attr_sf_findname(args)); size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); sf = xfs_idata_realloc(dp, size, XFS_ATTR_FORK); sfe = xfs_attr_sf_endptr(sf); sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; sfe->flags = args->attr_filter; memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); sf->count++; be16_add_cpu(&sf->totsize, size); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); xfs_sbversion_add_attr2(mp, args->trans); } /* * After the last attribute is removed revert to original inode format, * making all literal area available to the data fork once more. */ void xfs_attr_fork_remove( struct xfs_inode *ip, struct xfs_trans *tp) { ASSERT(ip->i_af.if_nextents == 0); xfs_ifork_zap_attr(ip); ip->i_forkoff = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } /* * Remove an attribute from the shortform attribute list structure. 
*/
int
xfs_attr_sf_removename(
	struct xfs_da_args		*args)
{
	struct xfs_inode		*dp = args->dp;
	struct xfs_mount		*mp = dp->i_mount;
	struct xfs_attr_sf_hdr		*hdr = dp->i_af.if_data;
	struct xfs_attr_sf_entry	*victim;
	uint16_t			remaining = be16_to_cpu(hdr->totsize);
	void				*tail_start;
	void				*tail_end;
	int				entsize;
	bool				drop_fork;

	trace_xfs_attr_sf_remove(args);

	victim = xfs_attr_sf_findname(args);
	if (!victim) {
		/*
		 * During recovery a missing attr simply means there is
		 * nothing left to clean up, which is not a failure.
		 */
		return (args->op_flags & XFS_DA_OP_RECOVERY) ? 0 : -ENOATTR;
	}

	/*
	 * Close the gap: slide every entry that follows the victim down on
	 * top of it, then account for the removal in the header.
	 */
	entsize = xfs_attr_sf_entsize(victim);
	tail_start = xfs_attr_sf_nextentry(victim);
	tail_end = xfs_attr_sf_endptr(hdr);
	if (tail_start < tail_end)
		memmove(victim, tail_start, tail_end - tail_start);
	hdr->count--;
	remaining -= entsize;
	hdr->totsize = cpu_to_be16(remaining);

	/*
	 * The fork itself is removed only when nothing but the header is
	 * left and none of the conditions below require it to be kept.
	 */
	drop_fork = remaining == sizeof(struct xfs_attr_sf_hdr) &&
		    xfs_has_attr2(mp) &&
		    dp->i_df.if_format != XFS_DINODE_FMT_BTREE &&
		    !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) &&
		    !xfs_has_parent(mp);

	if (drop_fork) {
		xfs_attr_fork_remove(dp, args->trans);
	} else {
		/* Shrink the fork and recompute where it should start. */
		xfs_idata_realloc(dp, -entsize, XFS_ATTR_FORK);
		dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, remaining);
		ASSERT(dp->i_forkoff);
		ASSERT(remaining > sizeof(struct xfs_attr_sf_hdr) ||
				(args->op_flags & XFS_DA_OP_ADDNAME) ||
				!xfs_has_attr2(mp) ||
				dp->i_df.if_format == XFS_DINODE_FMT_BTREE ||
				xfs_has_parent(mp));
		xfs_trans_log_inode(args->trans, dp,
					XFS_ILOG_CORE | XFS_ILOG_ADATA);
	}

	xfs_sbversion_add_attr2(mp, args->trans);

	return 0;
}

/*
 * Retrieve the attribute value and length.
 *
 * If args->valuelen is zero, only the length needs to be returned.  Unlike a
 * lookup, we only return an error if the attribute does not exist or we can't
 * retrieve the value.
*/ int xfs_attr_shortform_getvalue( struct xfs_da_args *args) { struct xfs_attr_sf_entry *sfe; ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL); trace_xfs_attr_sf_lookup(args); sfe = xfs_attr_sf_findname(args); if (!sfe) return -ENOATTR; return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], sfe->valuelen); } /* Convert from using the shortform to the leaf format. */ int xfs_attr_shortform_to_leaf( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; struct xfs_ifork *ifp = &dp->i_af; struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; int size = be16_to_cpu(sf->totsize); struct xfs_da_args nargs; char *tmpbuffer; int error, i; xfs_dablk_t blkno; struct xfs_buf *bp; trace_xfs_attr_sf_to_leaf(args); tmpbuffer = kmalloc(size, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, ifp->if_data, size); sf = (struct xfs_attr_sf_hdr *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); bp = NULL; error = xfs_da_grow_inode(args, &blkno); if (error) goto out; ASSERT(blkno == 0); error = xfs_attr3_leaf_create(args, blkno, &bp); if (error) goto out; memset((char *)&nargs, 0, sizeof(nargs)); nargs.dp = dp; nargs.geo = args->geo; nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; nargs.owner = args->owner; sfe = xfs_attr_sf_firstentry(sf); for (i = 0; i < sf->count; i++) { nargs.name = sfe->nameval; nargs.namelen = sfe->namelen; nargs.value = &sfe->nameval[nargs.namelen]; nargs.valuelen = sfe->valuelen; nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK; if (!xfs_attr_check_namespace(sfe->flags)) { xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto out; } xfs_attr_sethash(&nargs); error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == -ENOATTR); error = xfs_attr3_leaf_add(bp, &nargs); ASSERT(error != -ENOSPC); if (error) goto out; sfe = xfs_attr_sf_nextentry(sfe); } 
error = 0; out: kfree(tmpbuffer); return error; } /* * Check a leaf attribute block to see if all the entries would fit into * a shortform attribute list. */ int xfs_attr_shortform_allfit( struct xfs_buf *bp, struct xfs_inode *dp) { struct xfs_attr_leafblock *leaf; struct xfs_attr_leaf_entry *entry; xfs_attr_leaf_name_local_t *name_loc; struct xfs_attr3_icleaf_hdr leafhdr; int bytes; int i; struct xfs_mount *mp = bp->b_mount; leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); bytes = sizeof(struct xfs_attr_sf_hdr); for (i = 0; i < leafhdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!(entry->flags & XFS_ATTR_LOCAL)) return 0; name_loc = xfs_attr3_leaf_name_local(leaf, i); if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) return 0; if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return 0; bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } if (xfs_has_attr2(dp->i_mount) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; return xfs_attr_shortform_bytesfit(dp, bytes); } /* Verify the consistency of a raw inline attribute fork. */ xfs_failaddr_t xfs_attr_shortform_verify( struct xfs_attr_sf_hdr *sfp, size_t size) { struct xfs_attr_sf_entry *sfep = xfs_attr_sf_firstentry(sfp); struct xfs_attr_sf_entry *next_sfep; char *endp; int i; /* * Give up if the attribute is way too short. */ if (size < sizeof(struct xfs_attr_sf_hdr)) return __this_address; endp = (char *)sfp + size; /* Check all reported entries */ for (i = 0; i < sfp->count; i++) { /* * struct xfs_attr_sf_entry has a variable length. * Check the fixed-offset parts of the structure are * within the data buffer. * xfs_attr_sf_entry is defined with a 1-byte variable * array at the end, so we must subtract that off. 
*/ if (((char *)sfep + sizeof(*sfep)) >= endp) return __this_address; /* Don't allow names with known bad length. */ if (sfep->namelen == 0) return __this_address; /* * Check that the variable-length part of the structure is * within the data buffer. The next entry starts after the * name component, so nextentry is an acceptable test. */ next_sfep = xfs_attr_sf_nextentry(sfep); if ((char *)next_sfep > endp) return __this_address; /* * Check for unknown flags. Short form doesn't support * the incomplete or local bits, so we can use the namespace * mask here. */ if (sfep->flags & ~XFS_ATTR_NSP_ONDISK_MASK) return __this_address; /* * Check for invalid namespace combinations. We only allow * one namespace flag per xattr, so we can just count the * bits (i.e. hweight) here. */ if (!xfs_attr_check_namespace(sfep->flags)) return __this_address; sfep = next_sfep; } if ((void *)sfep != (void *)endp) return __this_address; return NULL; } /* * Convert a leaf attribute list to shortform attribute list */ int xfs_attr3_leaf_to_shortform( struct xfs_buf *bp, struct xfs_da_args *args, int forkoff) { struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_local *name_loc; struct xfs_da_args nargs; struct xfs_inode *dp = args->dp; char *tmpbuffer; int error; int i; trace_xfs_attr_leaf_to_sf(args); tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); if (!tmpbuffer) return -ENOMEM; memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); leaf = (xfs_attr_leafblock_t *)tmpbuffer; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); /* XXX (dgc): buffer is about to be marked stale - why zero it? */ memset(bp->b_addr, 0, args->geo->blksize); /* * Clean out the prior contents of the attribute list. 
*/ error = xfs_da_shrink_inode(args, 0, bp); if (error) goto out; if (forkoff == -1) { /* * Don't remove the attr fork if this operation is the first * part of a attr replace operations. We're going to add a new * attr immediately, so we need to keep the attr fork around in * this case. */ if (!(args->op_flags & XFS_DA_OP_REPLACE)) { ASSERT(xfs_has_attr2(dp->i_mount)); ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); xfs_attr_fork_remove(dp, args->trans); } goto out; } xfs_attr_shortform_create(args); /* * Copy the attributes */ memset((char *)&nargs, 0, sizeof(nargs)); nargs.geo = args->geo; nargs.dp = dp; nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; nargs.op_flags = XFS_DA_OP_OKNOENT; nargs.owner = args->owner; for (i = 0; i < ichdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!entry->nameidx) continue; ASSERT(entry->flags & XFS_ATTR_LOCAL); name_loc = xfs_attr3_leaf_name_local(leaf, i); nargs.name = name_loc->nameval; nargs.namelen = name_loc->namelen; nargs.value = &name_loc->nameval[nargs.namelen]; nargs.valuelen = be16_to_cpu(name_loc->valuelen); nargs.hashval = be32_to_cpu(entry->hashval); nargs.attr_filter = entry->flags & XFS_ATTR_NSP_ONDISK_MASK; xfs_attr_shortform_add(&nargs, forkoff); } error = 0; out: kfree(tmpbuffer); return error; } /* * Convert from using a single leaf to a root node and a leaf. 
*/
int
xfs_attr3_leaf_to_node(
	struct xfs_da_args	*args)
{
	struct xfs_inode	*dp = args->dp;
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_attr3_icleaf_hdr leafhdr;
	struct xfs_da3_icnode_hdr nodehdr;
	struct xfs_attr_leaf_entry *leaf_entries;
	struct xfs_attr_leafblock *newleaf;
	struct xfs_da_intnode	*root;
	struct xfs_buf		*blk0_bp = NULL;
	struct xfs_buf		*newleaf_bp = NULL;
	xfs_dablk_t		newblkno;
	int			error;

	trace_xfs_attr_leaf_to_node(args);

	/* Error-injection hook for this conversion. */
	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE))
		return -EIO;

	/* Get a new block, read the leaf at block 0, grab the new buffer. */
	error = xfs_da_grow_inode(args, &newblkno);
	if (error)
		return error;

	error = xfs_attr3_leaf_read(args->trans, dp, args->owner, 0, &blk0_bp);
	if (error)
		return error;

	error = xfs_da_get_buf(args->trans, dp, newblkno, &newleaf_bp,
			XFS_ATTR_FORK);
	if (error)
		return error;

	/* Duplicate the leaf into the new block and log the whole copy. */
	xfs_da_buf_copy(newleaf_bp, blk0_bp, args->geo->blksize);
	xfs_trans_log_buf(args->trans, newleaf_bp, 0, args->geo->blksize - 1);

	/* Block 0 is now re-created as the root node. */
	error = xfs_da3_node_create(args, 0, 1, &blk0_bp, XFS_ATTR_FORK);
	if (error)
		return error;

	root = blk0_bp->b_addr;
	xfs_da3_node_hdr_from_disk(mp, &nodehdr, root);

	newleaf = newleaf_bp->b_addr;
	xfs_attr3_leaf_hdr_from_disk(args->geo, &leafhdr, newleaf);
	leaf_entries = xfs_attr3_leaf_entryp(newleaf);

	/*
	 * The root gets a single entry pointing at the relocated leaf.  The
	 * hashval is copied disk-to-disk, so no endian conversion is done.
	 */
	nodehdr.btree[0].hashval = leaf_entries[leafhdr.count - 1].hashval;
	nodehdr.btree[0].before = cpu_to_be32(newblkno);
	nodehdr.count = 1;
	xfs_da3_node_hdr_to_disk(dp->i_mount, root, &nodehdr);
	xfs_trans_log_buf(args->trans, blk0_bp, 0, args->geo->blksize - 1);

	return 0;
}

/*========================================================================
 * Routines used for growing the Btree.
 *========================================================================*/

/*
 * Create the initial contents of a leaf attribute list
 * or a leaf in a node attribute list.
*/ STATIC int xfs_attr3_leaf_create( struct xfs_da_args *args, xfs_dablk_t blkno, struct xfs_buf **bpp) { struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_buf *bp; int error; trace_xfs_attr_leaf_create(args); error = xfs_da_get_buf(args->trans, args->dp, blkno, &bp, XFS_ATTR_FORK); if (error) return error; bp->b_ops = &xfs_attr3_leaf_buf_ops; xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF); leaf = bp->b_addr; memset(leaf, 0, args->geo->blksize); memset(&ichdr, 0, sizeof(ichdr)); ichdr.firstused = args->geo->blksize; if (xfs_has_crc(mp)) { struct xfs_da3_blkinfo *hdr3 = bp->b_addr; ichdr.magic = XFS_ATTR3_LEAF_MAGIC; hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(args->owner); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); } else { ichdr.magic = XFS_ATTR_LEAF_MAGIC; ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr); } ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1); *bpp = bp; return 0; } /* * Split the leaf node, rebalance, then add the new entry. */ int xfs_attr3_leaf_split( struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk) { xfs_dablk_t blkno; int error; trace_xfs_attr_leaf_split(state->args); /* * Allocate space for a new leaf node. */ ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC); error = xfs_da_grow_inode(state->args, &blkno); if (error) return error; error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp); if (error) return error; newblk->blkno = blkno; newblk->magic = XFS_ATTR_LEAF_MAGIC; /* * Rebalance the entries across the two leaves. * NOTE: rebalance() currently depends on the 2nd block being empty. 
*/ xfs_attr3_leaf_rebalance(state, oldblk, newblk); error = xfs_da3_blk_link(state, oldblk, newblk); if (error) return error; /* * Save info on "old" attribute for "atomic rename" ops, leaf_add() * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the * "new" attrs info. Will need the "old" info to remove it later. * * Insert the "new" entry in the correct block. */ if (state->inleaf) { trace_xfs_attr_leaf_add_old(state->args); error = xfs_attr3_leaf_add(oldblk->bp, state->args); } else { trace_xfs_attr_leaf_add_new(state->args); error = xfs_attr3_leaf_add(newblk->bp, state->args); } /* * Update last hashval in each block since we added the name. */ oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); return error; } /* * Add a name to the leaf attribute list structure. */ int xfs_attr3_leaf_add( struct xfs_buf *bp, struct xfs_da_args *args) { struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; int tablesize; int entsize; int sum; int tmp; int i; trace_xfs_attr_leaf_add(args); leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index >= 0 && args->index <= ichdr.count); entsize = xfs_attr_leaf_newentsize(args, NULL); /* * Search through freemap for first-fit on new name length. 
* (may need to figure in size of entry struct too) */ tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t) + xfs_attr3_leaf_hdr_size(leaf); for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) { if (tablesize > ichdr.firstused) { sum += ichdr.freemap[i].size; continue; } if (!ichdr.freemap[i].size) continue; /* no space in this map */ tmp = entsize; if (ichdr.freemap[i].base < ichdr.firstused) tmp += sizeof(xfs_attr_leaf_entry_t); if (ichdr.freemap[i].size >= tmp) { tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i); goto out_log_hdr; } sum += ichdr.freemap[i].size; } /* * If there are no holes in the address space of the block, * and we don't have enough freespace, then compaction will do us * no good and we should just give up. */ if (!ichdr.holes && sum < entsize) return -ENOSPC; /* * Compact the entries to coalesce free space. * This may change the hdr->count via dropping INCOMPLETE entries. */ xfs_attr3_leaf_compact(args, &ichdr, bp); /* * After compaction, the block is guaranteed to have only one * free region, in freemap[0]. If it is not big enough, give up. */ if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { tmp = -ENOSPC; goto out_log_hdr; } tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); out_log_hdr: xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); return tmp; } /* * Add a name to a leaf attribute list structure. 
*/
STATIC int
xfs_attr3_leaf_add_work(
	struct xfs_buf		*bp,
	struct xfs_attr3_icleaf_hdr *ichdr,
	struct xfs_da_args	*args,
	int			mapindex)
{
	struct xfs_attr_leafblock *leaf;
	struct xfs_attr_leaf_entry *entry;
	struct xfs_attr_leaf_name_local *name_loc;
	struct xfs_attr_leaf_name_remote *name_rmt;
	struct xfs_mount	*mp;
	int			tmp;
	int			i;

	trace_xfs_attr_leaf_add_work(args);

	leaf = bp->b_addr;
	ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
	ASSERT(args->index >= 0 && args->index <= ichdr->count);

	/*
	 * Force open some space in the entry array and fill it in.
	 * Entries at and after args->index move up by one slot; the moved
	 * range plus the vacated slot is logged.
	 */
	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
	if (args->index < ichdr->count) {
		tmp  = ichdr->count - args->index;
		tmp *= sizeof(xfs_attr_leaf_entry_t);
		memmove(entry + 1, entry, tmp);
		xfs_trans_log_buf(args->trans, bp,
		    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
	}
	ichdr->count++;

	/*
	 * Allocate space for the new string (at the end of the run).
	 */
	mp = args->trans->t_mountp;
	ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
	ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
	ASSERT(ichdr->freemap[mapindex].size >=
		xfs_attr_leaf_newentsize(args, NULL));
	ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
	ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);

	/*
	 * Shrink the chosen free region by the new entry's size.  The call
	 * also stores into tmp; a non-zero tmp marks the entry LOCAL below.
	 */
	ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);

	/* The name/value goes at the (new) end of the free region. */
	entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
				     ichdr->freemap[mapindex].size);
	entry->hashval = cpu_to_be32(args->hashval);
	entry->flags = args->attr_filter;
	if (tmp)
		entry->flags |= XFS_ATTR_LOCAL;
	if (args->op_flags & XFS_DA_OP_REPLACE) {
		/* Unlogged replaces mark the new entry INCOMPLETE. */
		if (!(args->op_flags & XFS_DA_OP_LOGGED))
			entry->flags |= XFS_ATTR_INCOMPLETE;
		/*
		 * If index2 is in this block at or before the insertion
		 * point, it has just been shifted up by one slot.
		 */
		if ((args->blkno2 == args->blkno) &&
		    (args->index2 <= args->index)) {
			args->index2++;
		}
	}
	xfs_trans_log_buf(args->trans, bp,
			  XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
	/* The entry array must remain sorted by hashval. */
	ASSERT((args->index == 0) ||
	       (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
	ASSERT((args->index == ichdr->count - 1) ||
	       (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));

	/*
	 * For "remote" attribute values, simply note that we need to
	 * allocate space for the "remote" value.  We can't actually
	 * allocate the extents in this transaction, and we can't decide
	 * which blocks they should be as we might allocate more blocks
	 * as part of this transaction (a split operation for example).
	 */
	if (entry->flags & XFS_ATTR_LOCAL) {
		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
		name_loc->namelen = args->namelen;
		name_loc->valuelen = cpu_to_be16(args->valuelen);
		memcpy((char *)name_loc->nameval, args->name, args->namelen);
		memcpy((char *)&name_loc->nameval[args->namelen], args->value,
				   be16_to_cpu(name_loc->valuelen));
	} else {
		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
		name_rmt->namelen = args->namelen;
		memcpy((char *)name_rmt->name, args->name, args->namelen);
		entry->flags |= XFS_ATTR_INCOMPLETE;
		/* just in case */
		name_rmt->valuelen = 0;
		name_rmt->valueblk = 0;
		args->rmtblkno = 1;
		args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
		args->rmtvaluelen = args->valuelen;
	}
	xfs_trans_log_buf(args->trans, bp,
	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
				   xfs_attr_leaf_entsize(leaf, args->index)));

	/*
	 * Update the control info for this leaf node
	 */
	if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
		ichdr->firstused = be16_to_cpu(entry->nameidx);

	ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
					+ xfs_attr3_leaf_hdr_size(leaf));
	/*
	 * tmp is now the offset where the entry table ended before this
	 * insertion.  Any free region starting exactly there has lost its
	 * first sizeof(entry) bytes to the grown table; min_t() keeps the
	 * size from going below zero.
	 */
	tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
					+ xfs_attr3_leaf_hdr_size(leaf);

	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
		if (ichdr->freemap[i].base == tmp) {
			ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
			ichdr->freemap[i].size -=
				min_t(uint16_t, ichdr->freemap[i].size,
						sizeof(xfs_attr_leaf_entry_t));
		}
	}
	ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
	return 0;
}

/*
 * Garbage collect a leaf attribute list block by copying it to a new buffer.
*/
STATIC void
xfs_attr3_leaf_compact(
	struct xfs_da_args	*args,
	struct xfs_attr3_icleaf_hdr *ichdr_dst,
	struct xfs_buf		*bp)
{
	struct xfs_attr3_icleaf_hdr old_hdr;
	struct xfs_attr_leafblock *old_leaf;
	struct xfs_attr_leafblock *new_leaf = bp->b_addr;

	trace_xfs_attr_leaf_compact(args);

	/* Snapshot the whole block, then wipe the live buffer. */
	old_leaf = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
	memcpy(old_leaf, bp->b_addr, args->geo->blksize);
	memset(bp->b_addr, 0, args->geo->blksize);

	/*
	 * Put the on-disk header straight back, so that header fields
	 * which have no in-core counterpart are preserved.
	 */
	memcpy(bp->b_addr, old_leaf, xfs_attr3_leaf_hdr_size(old_leaf));

	/*
	 * The source keeps the caller's in-core header (struct copy); the
	 * destination header is reset to describe an empty block.
	 */
	old_hdr = *ichdr_dst;
	ichdr_dst->firstused = args->geo->blksize;
	ichdr_dst->usedbytes = 0;
	ichdr_dst->count = 0;
	ichdr_dst->holes = 0;
	ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(old_leaf);
	ichdr_dst->freemap[0].size = ichdr_dst->firstused -
						ichdr_dst->freemap[0].base;

	/* Write the header out to initialise the underlying buffer. */
	xfs_attr3_leaf_hdr_to_disk(args->geo, new_leaf, ichdr_dst);

	/*
	 * Move every entry across in its existing (sorted) order; the
	 * name/value pairs get allocated packed and in sequence.
	 */
	xfs_attr3_leaf_moveents(args, old_leaf, &old_hdr, 0,
				new_leaf, ichdr_dst, 0, old_hdr.count);

	/*
	 * The entire buffer is logged here, but it remains the caller's
	 * job to write the header back once it has finished changing it.
	 */
	xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);

	kfree(old_leaf);
}

/*
 * Compare two leaf blocks "order".
 * Return 0 unless leaf2 should go before leaf1.
*/ static int xfs_attr3_leaf_order( struct xfs_buf *leaf1_bp, struct xfs_attr3_icleaf_hdr *leaf1hdr, struct xfs_buf *leaf2_bp, struct xfs_attr3_icleaf_hdr *leaf2hdr) { struct xfs_attr_leaf_entry *entries1; struct xfs_attr_leaf_entry *entries2; entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr); entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr); if (leaf1hdr->count > 0 && leaf2hdr->count > 0 && ((be32_to_cpu(entries2[0].hashval) < be32_to_cpu(entries1[0].hashval)) || (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) < be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) { return 1; } return 0; } int xfs_attr_leaf_order( struct xfs_buf *leaf1_bp, struct xfs_buf *leaf2_bp) { struct xfs_attr3_icleaf_hdr ichdr1; struct xfs_attr3_icleaf_hdr ichdr2; struct xfs_mount *mp = leaf1_bp->b_mount; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr); xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr); return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); } /* * Redistribute the attribute list entries between two leaf nodes, * taking into account the size of the new entry. * * NOTE: if new block is empty, then it will get the upper half of the * old block. At present, all (one) callers pass in an empty second block. * * This code adjusts the args->index/blkno and args->index2/blkno2 fields * to match what it is doing in splitting the attribute leaf block. Those * values are used in "atomic rename" operations on attributes. Note that * the "new" and "old" values can end up in different blocks. 
*/
STATIC void
xfs_attr3_leaf_rebalance(
	struct xfs_da_state	*state,
	struct xfs_da_state_blk	*blk1,
	struct xfs_da_state_blk	*blk2)
{
	struct xfs_da_args	*args;
	struct xfs_attr_leafblock *leaf1;
	struct xfs_attr_leafblock *leaf2;
	struct xfs_attr3_icleaf_hdr ichdr1;
	struct xfs_attr3_icleaf_hdr ichdr2;
	struct xfs_attr_leaf_entry *entries1;
	struct xfs_attr_leaf_entry *entries2;
	int			count;
	int			totallen;
	int			max;
	int			space;
	int			swap;

	/*
	 * Set up environment.
	 */
	ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
	ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
	leaf1 = blk1->bp->b_addr;
	leaf2 = blk2->bp->b_addr;
	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1);
	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2);
	ASSERT(ichdr2.count == 0);
	args = state->args;

	trace_xfs_attr_leaf_rebalance(args);

	/*
	 * Check ordering of blocks, reverse if it makes things simpler.
	 *
	 * NOTE: Given that all (current) callers pass in an empty
	 * second block, this code should never set "swap".
	 */
	swap = 0;
	if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
		swap(blk1, blk2);

		/* swap structures rather than reconverting them */
		swap(ichdr1, ichdr2);

		leaf1 = blk1->bp->b_addr;
		leaf2 = blk2->bp->b_addr;
		swap = 1;
	}

	/*
	 * Examine entries until we reduce the absolute difference in
	 * byte usage between the two blocks to a minimum.  Then get
	 * the direction to copy and the number of elements to move.
	 *
	 * "inleaf" is true if the new entry should be inserted into blk1.
	 * If "swap" is also true, then reverse the sense of "inleaf".
	 */
	state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
						      blk2, &ichdr2,
						      &count, &totallen);
	if (swap)
		state->inleaf = !state->inleaf;

	/*
	 * Move any entries required from leaf to leaf:
	 */
	if (count < ichdr1.count) {
		/*
		 * Figure the total bytes to be added to the destination leaf.
		 */
		/* number entries being moved */
		count = ichdr1.count - count;
		space  = ichdr1.usedbytes - totallen;
		space += count * sizeof(xfs_attr_leaf_entry_t);

		/*
		 * leaf2 is the destination, compact it if it looks tight.
		 */
		max  = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
		max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
		if (space > max)
			xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);

		/*
		 * Move high entries from leaf1 to low end of leaf2.
		 */
		xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
				ichdr1.count - count, leaf2, &ichdr2, 0, count);

	} else if (count > ichdr1.count) {
		/*
		 * I assert that since all callers pass in an empty
		 * second buffer, this code should never execute.
		 */
		ASSERT(0);

		/*
		 * Figure the total bytes to be added to the destination leaf.
		 */
		/* number entries being moved */
		count -= ichdr1.count;
		space  = totallen - ichdr1.usedbytes;
		space += count * sizeof(xfs_attr_leaf_entry_t);

		/*
		 * leaf1 is the destination, compact it if it looks tight.
		 */
		max  = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
		max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
		if (space > max)
			xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);

		/*
		 * Move low entries from leaf2 to high end of leaf1.
		 */
		xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
					ichdr1.count, count);
	}

	/* Write both in-core headers back and log both blocks in full. */
	xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1);
	xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2);
	xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
	xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);

	/*
	 * Copy out last hashval in each block for B-tree code.
	 */
	entries1 = xfs_attr3_leaf_entryp(leaf1);
	entries2 = xfs_attr3_leaf_entryp(leaf2);
	blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
	blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);

	/*
	 * Adjust the expected index for insertion.
	 * NOTE: this code depends on the (current) situation that the
	 * second block was originally empty.
	 *
	 * If the insertion point moved to the 2nd block, we must adjust
	 * the index.  We must also track the entry just following the
	 * new entry for use in an "atomic rename" operation, that entry
	 * is always the "old" entry and the "new" entry is what we are
	 * inserting.  The index/blkno fields refer to the "old" entry,
	 * while the index2/blkno2 fields refer to the "new" entry.
	 */
	if (blk1->index > ichdr1.count) {
		/* Insertion point lies beyond blk1: it moved to blk2. */
		ASSERT(state->inleaf == 0);
		blk2->index = blk1->index - ichdr1.count;
		args->index = args->index2 = blk2->index;
		args->blkno = args->blkno2 = blk2->blkno;
	} else if (blk1->index == ichdr1.count) {
		/* Insertion point sits exactly on the split boundary. */
		if (state->inleaf) {
			args->index = blk1->index;
			args->blkno = blk1->blkno;
			args->index2 = 0;
			args->blkno2 = blk2->blkno;
		} else {
			/*
			 * On a double leaf split, the original attr location
			 * is already stored in blkno2/index2, so don't
			 * overwrite it otherwise we corrupt the tree.
			 */
			blk2->index = blk1->index - ichdr1.count;
			args->index = blk2->index;
			args->blkno = blk2->blkno;
			if (!state->extravalid) {
				/*
				 * set the new attr location to match the old
				 * one and let the higher level split code
				 * decide where in the leaf to place it.
				 */
				args->index2 = blk2->index;
				args->blkno2 = blk2->blkno;
			}
		}
	} else {
		/* Insertion point is still inside blk1. */
		ASSERT(state->inleaf == 1);
		args->index = args->index2 = blk1->index;
		args->blkno = args->blkno2 = blk1->blkno;
	}
}

/*
 * Examine entries until we reduce the absolute difference in
 * byte usage between the two blocks to a minimum.
 * GROT: Is this really necessary?  With other than a 512 byte blocksize,
 * GROT: there will always be enough room in either block for a new entry.
 * GROT: Do a double-split for this case?
*/
STATIC int
xfs_attr3_leaf_figure_balance(
	struct xfs_da_state		*state,
	struct xfs_da_state_blk		*blk1,
	struct xfs_attr3_icleaf_hdr	*ichdr1,
	struct xfs_da_state_blk		*blk2,
	struct xfs_attr3_icleaf_hdr	*ichdr2,
	int				*countarg,
	int				*usedbytesarg)
{
	struct xfs_attr_leafblock	*leaf1 = blk1->bp->b_addr;
	struct xfs_attr_leafblock	*leaf2 = blk2->bp->b_addr;
	struct xfs_attr_leaf_entry	*entry;
	int				count;
	int				max;
	int				index;
	int				totallen = 0;
	int				half;
	int				lastdelta;
	int				foundit = 0;
	int				tmp;

	/*
	 * Examine entries until we reduce the absolute difference in
	 * byte usage between the two blocks to a minimum.
	 *
	 * "half" is half of the bytes used by all existing entries plus
	 * the new one (entry structs and name/value space together).
	 */
	max = ichdr1->count + ichdr2->count;
	half = (max + 1) * sizeof(*entry);
	half += ichdr1->usedbytes + ichdr2->usedbytes +
			xfs_attr_leaf_newentsize(state->args, NULL);
	half /= 2;
	/* Start with a delta of one block size; it is lowered as we go. */
	lastdelta = state->args->geo->blksize;
	entry = xfs_attr3_leaf_entryp(leaf1);
	for (count = index = 0; count < max; entry++, index++, count++) {

#define XFS_ATTR_ABS(A)	(((A) < 0) ? -(A) : (A))
		/*
		 * The new entry is in the first block, account for it.
		 */
		if (count == blk1->index) {
			tmp = totallen + sizeof(*entry) +
				xfs_attr_leaf_newentsize(state->args, NULL);
			if (XFS_ATTR_ABS(half - tmp) > lastdelta)
				break;
			lastdelta = XFS_ATTR_ABS(half - tmp);
			totallen = tmp;
			foundit = 1;
		}

		/*
		 * Wrap around into the second block if necessary.
		 */
		if (count == ichdr1->count) {
			leaf1 = leaf2;
			entry = xfs_attr3_leaf_entryp(leaf1);
			index = 0;
		}

		/*
		 * Figure out if next leaf entry would be too much.
		 */
		tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
									index);
		if (XFS_ATTR_ABS(half - tmp) > lastdelta)
			break;
		lastdelta = XFS_ATTR_ABS(half - tmp);
		totallen = tmp;
#undef XFS_ATTR_ABS
	}

	/*
	 * Calculate the number of usedbytes that will end up in lower block.
	 * If new entry not in lower block, fix up the count.
	 */
	totallen -= count * sizeof(*entry);
	if (foundit) {
		totallen -= sizeof(*entry) +
				xfs_attr_leaf_newentsize(state->args, NULL);
	}

	/* Outputs: entry count and used bytes tallied; returns foundit. */
	*countarg = count;
	*usedbytesarg = totallen;
	return foundit;
}

/*========================================================================
 * Routines used for shrinking the Btree.
 *========================================================================*/

/*
 * Check a leaf block and its neighbors to see if the block should be
 * collapsed into one or the other neighbor.  Always keep the block
 * with the smaller block number.
 * If the current block is over 50% full, don't try to join it, return 0.
 * If the block is empty, fill in the state structure and return 2.
 * If it can be collapsed, fill in the state structure and return 1.
 * If nothing can be done, return 0.
 *
 * NOTE(review): in the code below the 0/1/2 result is stored through
 * @action, while the function's own return value is 0 or a negative errno.
 *
 * GROT: allow for INCOMPLETE entries in calculation.
 */
int
xfs_attr3_leaf_toosmall(
	struct xfs_da_state	*state,
	int			*action)
{
	struct xfs_attr_leafblock *leaf;
	struct xfs_da_state_blk	*blk;
	struct xfs_attr3_icleaf_hdr ichdr;
	struct xfs_buf		*bp;
	xfs_dablk_t		blkno;
	int			bytes;
	int			forward;
	int			error;
	int			retval;
	int			i;

	trace_xfs_attr_leaf_toosmall(state->args);

	/*
	 * Check for the degenerate case of the block being over 50% full.
	 * If so, it's not worth even looking to see if we might be able
	 * to coalesce with a sibling.
	 */
	blk = &state->path.blk[ state->path.active-1 ];
	leaf = blk->bp->b_addr;
	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf);
	bytes = xfs_attr3_leaf_hdr_size(leaf) +
		ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
		ichdr.usedbytes;
	if (bytes > (state->args->geo->blksize >> 1)) {
		*action = 0;	/* blk over 50%, don't try to join */
		return 0;
	}

	/*
	 * Check for the degenerate case of the block being empty.
	 * If the block is empty, we'll simply delete it, no need to
	 * coalesce it with a sibling block.  We choose (arbitrarily)
	 * to merge with the forward block unless it is NULL.
	 */
	if (ichdr.count == 0) {
		/*
		 * Make altpath point to the block we want to keep and
		 * path point to the block we want to drop (this one).
		 */
		forward = (ichdr.forw != 0);
		memcpy(&state->altpath, &state->path, sizeof(state->path));
		error = xfs_da3_path_shift(state, &state->altpath, forward,
						 0, &retval);
		if (error)
			return error;
		if (retval) {
			*action = 0;
		} else {
			*action = 2;
		}
		return 0;
	}

	/*
	 * Examine each sibling block to see if we can coalesce with
	 * at least 25% free space to spare.  We need to figure out
	 * whether to merge with the forward or the backward block.
	 * We prefer coalescing with the lower numbered sibling so as
	 * to shrink an attribute list over time.
	 */
	/* start with smaller blk num */
	forward = ichdr.forw < ichdr.back;
	for (i = 0; i < 2; forward = !forward, i++) {
		struct xfs_attr3_icleaf_hdr ichdr2;
		if (forward)
			blkno = ichdr.forw;
		else
			blkno = ichdr.back;
		if (blkno == 0)
			continue;
		error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
					state->args->owner, blkno, &bp);
		if (error)
			return error;

		xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr);

		/*
		 * bytes = 75% of a block minus what both blocks use for
		 * names/values, entry structs and one header.
		 */
		bytes = state->args->geo->blksize -
			(state->args->geo->blksize >> 2) -
			ichdr.usedbytes - ichdr2.usedbytes -
			((ichdr.count + ichdr2.count) *
					sizeof(xfs_attr_leaf_entry_t)) -
			xfs_attr3_leaf_hdr_size(leaf);

		xfs_trans_brelse(state->args->trans, bp);
		if (bytes >= 0)
			break;	/* fits with at least 25% to spare */
	}
	/* Neither sibling was usable. */
	if (i >= 2) {
		*action = 0;
		return 0;
	}

	/*
	 * Make altpath point to the block we want to keep (the lower
	 * numbered block) and path point to the block we want to drop.
	 */
	memcpy(&state->altpath, &state->path, sizeof(state->path));
	if (blkno < blk->blkno) {
		error = xfs_da3_path_shift(state, &state->altpath, forward,
						 0, &retval);
	} else {
		error = xfs_da3_path_shift(state, &state->path, forward,
						 0, &retval);
	}
	if (error)
		return error;
	if (retval) {
		*action = 0;
	} else {
		*action = 1;
	}
	return 0;
}

/*
 * Remove a name from the leaf attribute list structure.
* * Return 1 if leaf is less than 37% full, 0 if >= 37% full. * If two leaves are 37% full, when combined they will leave 25% free. */
int
xfs_attr3_leaf_remove(
	struct xfs_buf		*bp,
	struct xfs_da_args	*args)
{
	struct xfs_attr_leafblock *leaf;
	struct xfs_attr3_icleaf_hdr ichdr;
	struct xfs_attr_leaf_entry *entry;
	int			before;
	int			after;
	int			smallest;
	int			entsize;
	int			tablesize;
	int			tmp;
	int			i;

	trace_xfs_attr_leaf_remove(args);

	leaf = bp->b_addr;
	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);

	ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
	ASSERT(args->index >= 0 && args->index < ichdr.count);
	ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
					xfs_attr3_leaf_hdr_size(leaf));

	/* the entry being removed is selected by args->index */
	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];

	ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
	ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);

	/*
	 * Scan through free region table:
	 *    check for adjacency of free'd entry with an existing one,
	 *    find smallest free region in case we need to replace it,
	 *    adjust any map that borders the entry table,
	 */
	tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
					+ xfs_attr3_leaf_hdr_size(leaf);
	/* tmp tracks the smallest free-region size seen so far */
	tmp = ichdr.freemap[0].size;
	before = after = -1;
	smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
	entsize = xfs_attr_leaf_entsize(leaf, args->index);
	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
		ASSERT(ichdr.freemap[i].base < args->geo->blksize);
		ASSERT(ichdr.freemap[i].size < args->geo->blksize);
		/* region starts right after the table: grow it by one slot */
		if (ichdr.freemap[i].base == tablesize) {
			ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
			ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
		}

		if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
				be16_to_cpu(entry->nameidx)) {
			before = i;
		} else if (ichdr.freemap[i].base ==
				(be16_to_cpu(entry->nameidx) + entsize)) {
			after = i;
		} else if (ichdr.freemap[i].size < tmp) {
			tmp = ichdr.freemap[i].size;
			smallest = i;
		}
	}

	/*
	 * Coalesce adjacent freemap regions,
	 * or replace the smallest region.
	 */
	if ((before >= 0) || (after >= 0)) {
		if ((before >= 0) && (after >= 0)) {
			/* freed name bridges two regions: fold both into before */
			ichdr.freemap[before].size += entsize;
			ichdr.freemap[before].size += ichdr.freemap[after].size;
			ichdr.freemap[after].base = 0;
			ichdr.freemap[after].size = 0;
		} else if (before >= 0) {
			ichdr.freemap[before].size += entsize;
		} else {
			ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
			ichdr.freemap[after].size += entsize;
		}
	} else {
		/*
		 * Replace smallest region (if it is smaller than free'd entry)
		 */
		if (ichdr.freemap[smallest].size < entsize) {
			ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
			ichdr.freemap[smallest].size = entsize;
		}
	}

	/*
	 * Did we remove the first entry?
	 * From here on smallest is reused as a boolean flag.
	 */
	if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
		smallest = 1;
	else
		smallest = 0;

	/*
	 * Compress the remaining entries and zero out the removed stuff.
	 */
	memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
	ichdr.usedbytes -= entsize;
	xfs_trans_log_buf(args->trans, bp,
	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
				   entsize));

	/*
	 * NOTE(review): tmp covers (count - index) entries, which is one
	 * more than the number of entries that follow the removed slot.
	 * The surplus lands in the last table slot, which is cleared by
	 * the memset below.
	 */
	tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
	memmove(entry, entry + 1, tmp);
	ichdr.count--;
	xfs_trans_log_buf(args->trans, bp,
	    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));

	/* clear the table slot vacated at the end */
	entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
	memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));

	/*
	 * If we removed the first entry, re-find the first used byte
	 * in the name area.  Note that if the entry was the "firstused",
	 * then we don't have a "hole" in our block resulting from
	 * removing the name.
	 */
	if (smallest) {
		/* firstused becomes the lowest nameidx among remaining entries */
		tmp = args->geo->blksize;
		entry = xfs_attr3_leaf_entryp(leaf);
		for (i = ichdr.count - 1; i >= 0; entry++, i--) {
			ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
			ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);

			if (be16_to_cpu(entry->nameidx) < tmp)
				tmp = be16_to_cpu(entry->nameidx);
		}
		ichdr.firstused = tmp;
		ASSERT(ichdr.firstused != 0);
	} else {
		ichdr.holes = 1;	/* mark as needing compaction */
	}
	xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
	xfs_trans_log_buf(args->trans, bp,
			  XFS_DA_LOGRANGE(leaf, &leaf->hdr,
					  xfs_attr3_leaf_hdr_size(leaf)));

	/*
	 * Report whether the bytes still in use (header, entry table and
	 * name/value area) fall below args->geo->magicpct; the caller may
	 * want to "join" the leaf with a sibling if so.
	 */
	tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
	      ichdr.count * sizeof(xfs_attr_leaf_entry_t);

	return tmp < args->geo->magicpct; /* leaf is < 37% full */
}

/*
 * Move all the attribute list entries from drop_leaf into save_leaf.
 *
 * drop_blk->hashval is set to the last hash value found in the drop leaf
 * before the move, save_blk->hashval to the last hash value of the merged
 * save leaf.  The save header is written back and the whole save buffer
 * is logged.
 */
void
xfs_attr3_leaf_unbalance(
	struct xfs_da_state	*state,
	struct xfs_da_state_blk	*drop_blk,
	struct xfs_da_state_blk	*save_blk)
{
	struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
	struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
	struct xfs_attr3_icleaf_hdr drophdr;
	struct xfs_attr3_icleaf_hdr savehdr;
	struct xfs_attr_leaf_entry *entry;

	trace_xfs_attr_leaf_unbalance(state->args);

	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf);
	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf);
	entry = xfs_attr3_leaf_entryp(drop_leaf);

	/*
	 * Save last hashval from dying block for later Btree fixup.
	 */
	drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);

	/*
	 * Check if we need a temp buffer, or can we do it in place.
	 * Note that we don't check "leaf" for holes because we will
	 * always be dropping it, toosmall() decided that for us already.
	 */
	if (savehdr.holes == 0) {
		/*
		 * dest leaf has no holes, so we add there.  May need
		 * to make some room in the entry array.
		 *
		 * When xfs_attr3_leaf_order() returns nonzero the dropped
		 * entries are inserted at index 0 of the save leaf,
		 * otherwise they are appended after its last entry.
		 */
		if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
					 drop_blk->bp, &drophdr)) {
			xfs_attr3_leaf_moveents(state->args,
						drop_leaf, &drophdr, 0,
						save_leaf, &savehdr, 0,
						drophdr.count);
		} else {
			xfs_attr3_leaf_moveents(state->args,
						drop_leaf, &drophdr, 0,
						save_leaf, &savehdr,
						savehdr.count, drophdr.count);
		}
	} else {
		/*
		 * Destination has holes, so we make a temporary copy
		 * of the leaf and add them both to that.
		 */
		struct xfs_attr_leafblock *tmp_leaf;
		struct xfs_attr3_icleaf_hdr tmphdr;

		tmp_leaf = kzalloc(state->args->geo->blksize,
				GFP_KERNEL | __GFP_NOFAIL);

		/*
		 * Copy the header into the temp leaf so that all the stuff
		 * not in the incore header is present and gets copied back in
		 * once we've moved all the entries.
		 */
		memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));

		/* start from an empty leaf that keeps magic and sibling links */
		memset(&tmphdr, 0, sizeof(tmphdr));
		tmphdr.magic = savehdr.magic;
		tmphdr.forw = savehdr.forw;
		tmphdr.back = savehdr.back;
		tmphdr.firstused = state->args->geo->blksize;

		/* write the header to the temp buffer to initialise it */
		xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr);

		/* same ordering decision as above, both leaves go into tmp */
		if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
					 drop_blk->bp, &drophdr)) {
			xfs_attr3_leaf_moveents(state->args,
						drop_leaf, &drophdr, 0,
						tmp_leaf, &tmphdr, 0,
						drophdr.count);
			xfs_attr3_leaf_moveents(state->args,
						save_leaf, &savehdr, 0,
						tmp_leaf, &tmphdr, tmphdr.count,
						savehdr.count);
		} else {
			xfs_attr3_leaf_moveents(state->args,
						save_leaf, &savehdr, 0,
						tmp_leaf, &tmphdr, 0,
						savehdr.count);
			xfs_attr3_leaf_moveents(state->args,
						drop_leaf, &drophdr, 0,
						tmp_leaf, &tmphdr, tmphdr.count,
						drophdr.count);
		}
		/* the merged image replaces the save leaf wholesale */
		memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
		savehdr = tmphdr; /* struct copy */
		kfree(tmp_leaf);
	}

	xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
	xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
					   state->args->geo->blksize - 1);

	/*
	 * Copy out last hashval in each block for B-tree code.
	 */
	entry = xfs_attr3_leaf_entryp(save_leaf);
	save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
}

/*========================================================================
 * Routines used for finding things in the Btree.
 *========================================================================*/

/*
 * Look up a name in a leaf attribute list structure.
 * This is the internal routine, it uses the caller's buffer.
 *
 * Note that duplicate keys are allowed, but only check within the
 * current leaf node.  The Btree code must check in adjacent leaf nodes.
 *
 * Return in args->index the index into the entry[] array of either
 * the found entry, or where the entry should have been (insert before
 * that entry).
 *
 * Don't change the args->value unless we find the attribute.
 *
 * Returns -EEXIST when a matching entry is found, -ENOATTR when none is,
 * and -EFSCORRUPTED (after marking the buffer corrupt) when the entry
 * count or the search state fails the sanity checks below.  For a match
 * on a remote entry, args->rmtvaluelen, args->rmtblkno and
 * args->rmtblkcnt are filled in as well.
 */
int
xfs_attr3_leaf_lookup_int(
	struct xfs_buf		*bp,
	struct xfs_da_args	*args)
{
	struct xfs_attr_leafblock *leaf;
	struct xfs_attr3_icleaf_hdr ichdr;
	struct xfs_attr_leaf_entry *entry;
	struct xfs_attr_leaf_entry *entries;
	struct xfs_attr_leaf_name_local *name_loc;
	struct xfs_attr_leaf_name_remote *name_rmt;
	xfs_dahash_t		hashval;
	int			probe;
	int			span;

	trace_xfs_attr_leaf_lookup(args);

	leaf = bp->b_addr;
	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
	entries = xfs_attr3_leaf_entryp(leaf);
	if (ichdr.count >= args->geo->blksize / 8) {
		xfs_buf_mark_corrupt(bp);
		xfs_da_mark_sick(args);
		return -EFSCORRUPTED;
	}

	/*
	 * Binary search.  (note: small blocks will skip this loop)
	 */
	hashval = args->hashval;
	probe = span = ichdr.count / 2;
	for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
		span /= 2;
		if (be32_to_cpu(entry->hashval) < hashval)
			probe += span;
		else if (be32_to_cpu(entry->hashval) > hashval)
			probe -= span;
		else
			break;
	}
	/* probe must still index into the table (or be 0 for an empty one) */
	if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) {
		xfs_buf_mark_corrupt(bp);
		xfs_da_mark_sick(args);
		return -EFSCORRUPTED;
	}
	/* the loop may only stop early on an exact hash hit */
	if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) {
		xfs_buf_mark_corrupt(bp);
		xfs_da_mark_sick(args);
		return -EFSCORRUPTED;
	}

	/*
	 * Since we may have duplicate hashval's, find the first matching
	 * hashval in the leaf.
	 */
	while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
		entry--;
		probe--;
	}
	while (probe < ichdr.count &&
	       be32_to_cpu(entry->hashval) < hashval) {
		entry++;
		probe++;
	}
	if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
		args->index = probe;
		return -ENOATTR;
	}

	/*
	 * Duplicate keys may be present, so search all of them for a match.
	 */
	for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
			entry++, probe++) {
/*
 * GROT: Add code to remove incomplete entries.
 */
		if (entry->flags & XFS_ATTR_LOCAL) {
			name_loc = xfs_attr3_leaf_name_local(leaf, probe);
			if (!xfs_attr_match(args, entry->flags,
						name_loc->nameval,
						name_loc->namelen,
						&name_loc->nameval[name_loc->namelen],
						be16_to_cpu(name_loc->valuelen)))
				continue;
			args->index = probe;
			return -EEXIST;
		} else {
			unsigned int	valuelen;

			name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
			valuelen = be32_to_cpu(name_rmt->valuelen);
			if (!xfs_attr_match(args, entry->flags, name_rmt->name,
						name_rmt->namelen, NULL,
						valuelen))
				continue;
			/* hand back where the remote value lives */
			args->index = probe;
			args->rmtvaluelen = valuelen;
			args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
			args->rmtblkcnt = xfs_attr3_rmt_blocks(
							args->dp->i_mount,
							args->rmtvaluelen);
			return -EEXIST;
		}
	}
	/* ran out of entries with this hash value */
	args->index = probe;
	return -ENOATTR;
}

/* * Get the value associated with an attribute name from a leaf attribute * list structure. * * If args->valuelen is zero, only the length needs to be returned. Unlike a * lookup, we only return an error if the attribute does not exist or we can't * retrieve the value.
*/
int
xfs_attr3_leaf_getvalue(
	struct xfs_buf		*bp,
	struct xfs_da_args	*args)
{
	struct xfs_attr_leafblock *leaf;
	struct xfs_attr3_icleaf_hdr ichdr;
	struct xfs_attr_leaf_entry *entry;
	struct xfs_attr_leaf_name_local *name_loc;
	struct xfs_attr_leaf_name_remote *name_rmt;

	leaf = bp->b_addr;
	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
	ASSERT(ichdr.count < args->geo->blksize / 8);
	ASSERT(args->index < ichdr.count);

	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
	if (entry->flags & XFS_ATTR_LOCAL) {
		/* local entry: the value bytes follow the name in the leaf */
		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
		ASSERT(name_loc->namelen == args->namelen);
		ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
		return xfs_attr_copy_value(args,
					&name_loc->nameval[args->namelen],
					be16_to_cpu(name_loc->valuelen));
	}

	/*
	 * Remote entry: record the value length and location in args and
	 * call xfs_attr_copy_value() with a NULL value pointer.
	 */
	name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
	ASSERT(name_rmt->namelen == args->namelen);
	ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
	args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
	args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
	args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
					       args->rmtvaluelen);
	return xfs_attr_copy_value(args, NULL, args->rmtvaluelen);
}

/*========================================================================
 * Utility routines.
 *========================================================================*/

/*
 * Move the indicated entries from one leaf to another.
 * NOTE: this routine modifies both source and destination leaves.
 *
 * @leaf_s/@ichdr_s:	source leaf and its in-core header
 * @start_s:		index of the first source entry to move
 * @leaf_d/@ichdr_d:	destination leaf and its in-core header
 * @start_d:		index in the destination table to insert at
 * @count:		number of entries to move (0 is a no-op)
 *
 * Only the in-core headers are updated; this function does not write
 * them back into leaf_s or leaf_d.  On return the destination freemap
 * holds a single region between the entry table and firstused, and the
 * source header is marked as having holes.
 */
/*ARGSUSED*/
STATIC void
xfs_attr3_leaf_moveents(
	struct xfs_da_args		*args,
	struct xfs_attr_leafblock	*leaf_s,
	struct xfs_attr3_icleaf_hdr	*ichdr_s,
	int				start_s,
	struct xfs_attr_leafblock	*leaf_d,
	struct xfs_attr3_icleaf_hdr	*ichdr_d,
	int				start_d,
	int				count)
{
	struct xfs_attr_leaf_entry	*entry_s;
	struct xfs_attr_leaf_entry	*entry_d;
	int				desti;
	int				tmp;
	int				i;

	/*
	 * Check for nothing to do.
	 */
	if (count == 0)
		return;

	/*
	 * Set up environment.
	 */
	ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
	       ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
	ASSERT(ichdr_s->magic == ichdr_d->magic);
	ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
	ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
					+ xfs_attr3_leaf_hdr_size(leaf_s));
	ASSERT(ichdr_d->count < args->geo->blksize / 8);
	ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
					+ xfs_attr3_leaf_hdr_size(leaf_d));

	ASSERT(start_s < ichdr_s->count);
	ASSERT(start_d <= ichdr_d->count);
	ASSERT(count <= ichdr_s->count);


	/*
	 * Move the entries in the destination leaf up to make a hole?
	 */
	if (start_d < ichdr_d->count) {
		tmp  = ichdr_d->count - start_d;
		tmp *= sizeof(xfs_attr_leaf_entry_t);
		entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
		entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
		memmove(entry_d, entry_s, tmp);
	}

	/*
	 * Copy all entry's in the same (sorted) order,
	 * but allocate attribute info packed and in sequence.
	 */
	entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
	entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
	desti = start_d;
	for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
		ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
		/* tmp is the byte size of the name/value record being moved */
		tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
#ifdef GROT
		/*
		 * Code to drop INCOMPLETE entries.  Difficult to use as we
		 * may also need to change the insertion index.  Code turned
		 * off for 6.2, should be revisited later.
		 *
		 * NOTE(review): this branch uses offset and result, neither
		 * of which is declared in this function.
		 */
		if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
			memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
			ichdr_s->usedbytes -= tmp;
			ichdr_s->count -= 1;
			entry_d--;	/* to compensate for ++ in loop hdr */
			desti--;
			if ((start_s + i) < offset)
				result++;	/* insertion index adjustment */
		} else {
#endif /* GROT */
			/* carve the record out just below the current firstused */
			ichdr_d->firstused -= tmp;
			/* both on-disk, don't endian flip twice */
			entry_d->hashval = entry_s->hashval;
			entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
			entry_d->flags = entry_s->flags;
			ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
							<= args->geo->blksize);
			memmove(xfs_attr3_leaf_name(leaf_d, desti),
				xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
			ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
							<= args->geo->blksize);
			memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
			ichdr_s->usedbytes -= tmp;
			ichdr_d->usedbytes += tmp;
			ichdr_s->count -= 1;
			ichdr_d->count += 1;
			/* the entry table must not have grown into the names */
			tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
					+ xfs_attr3_leaf_hdr_size(leaf_d);
			ASSERT(ichdr_d->firstused >= tmp);
#ifdef GROT
		}
#endif /* GROT */
	}

	/*
	 * Zero out the entries we just copied.
	 */
	if (start_s == ichdr_s->count) {
		/* the moved entries were the tail of the source table */
		tmp = count * sizeof(xfs_attr_leaf_entry_t);
		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
		ASSERT(((char *)entry_s + tmp) <=
		       ((char *)leaf_s + args->geo->blksize));
		memset(entry_s, 0, tmp);
	} else {
		/*
		 * Move the remaining entries down to fill the hole,
		 * then zero the entries at the top.
		 */
		tmp  = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
		entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
		memmove(entry_d, entry_s, tmp);

		tmp = count * sizeof(xfs_attr_leaf_entry_t);
		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
		ASSERT(((char *)entry_s + tmp) <=
		       ((char *)leaf_s + args->geo->blksize));
		memset(entry_s, 0, tmp);
	}

	/*
	 * Fill in the freemap information
	 */
	ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
	ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
	ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
	ichdr_d->freemap[1].base = 0;
	ichdr_d->freemap[2].base = 0;
	ichdr_d->freemap[1].size = 0;
	ichdr_d->freemap[2].size = 0;
	ichdr_s->holes = 1;	/* leaf may not be compact */
}

/*
 * Pick up the last hashvalue from a leaf block.
 *
 * If count is not NULL, the number of entries in the leaf is stored in
 * *count.  Returns 0 when the leaf has no entries.
 */
xfs_dahash_t
xfs_attr_leaf_lasthash(
	struct xfs_buf	*bp,
	int		*count)
{
	struct xfs_attr3_icleaf_hdr ichdr;
	struct xfs_attr_leaf_entry *entries;
	struct xfs_mount *mp = bp->b_mount;

	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr);
	entries = xfs_attr3_leaf_entryp(bp->b_addr);
	if (count)
		*count = ichdr.count;
	if (!ichdr.count)
		return 0;
	return be32_to_cpu(entries[ichdr.count - 1].hashval);
}

/* * Calculate the number of bytes used to store the indicated attribute * (whether local or remote only calculate bytes in this block).
*/
STATIC int
xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
{
	struct xfs_attr_leaf_entry *entries;
	xfs_attr_leaf_name_local_t *name_loc;
	xfs_attr_leaf_name_remote_t *name_rmt;
	int size;

	entries = xfs_attr3_leaf_entryp(leaf);
	if (entries[index].flags & XFS_ATTR_LOCAL) {
		/* local: size depends on both name and value length */
		name_loc = xfs_attr3_leaf_name_local(leaf, index);
		size = xfs_attr_leaf_entsize_local(name_loc->namelen,
						   be16_to_cpu(name_loc->valuelen));
	} else {
		/* remote: size depends on the name length only */
		name_rmt = xfs_attr3_leaf_name_remote(leaf, index);
		size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
	}
	return size;
}

/*
 * Calculate the number of bytes that would be required to store the new
 * attribute (whether local or remote only calculate bytes in this block).
 * This routine decides as a side effect whether the attribute will be
 * a "local" or a "remote" attribute.
 *
 * The decision is stored in *local (1 for local, 0 for remote) when
 * local is not NULL.
 */
int
xfs_attr_leaf_newentsize(
	struct xfs_da_args	*args,
	int			*local)
{
	int			size;

	size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
	/* stays local only while below the local maximum for this block size */
	if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) {
		if (local)
			*local = 1;
		return size;
	}
	if (local)
		*local = 0;
	return xfs_attr_leaf_entsize_remote(args->namelen);
}


/*========================================================================
 * Manage the INCOMPLETE flag in a leaf entry
 *========================================================================*/

/*
 * Clear the INCOMPLETE flag on an entry in a leaf block.
 *
 * The entry is args->index in block args->blkno.  When args->rmtblkno is
 * nonzero the remote value block and length are stored in the entry as
 * well.  Returns 0, or the error from xfs_attr3_leaf_read().
 */
int
xfs_attr3_leaf_clearflag(
	struct xfs_da_args	*args)
{
	struct xfs_attr_leafblock *leaf;
	struct xfs_attr_leaf_entry *entry;
	struct xfs_attr_leaf_name_remote *name_rmt;
	struct xfs_buf		*bp;
	int			error;
#ifdef DEBUG
	struct xfs_attr3_icleaf_hdr ichdr;
	xfs_attr_leaf_name_local_t *name_loc;
	int namelen;
	char *name;
#endif /* DEBUG */

	trace_xfs_attr_leaf_clearflag(args);
	/*
	 * Set up the operation.
	 */
	error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
			args->blkno, &bp);
	if (error)
		return error;

	leaf = bp->b_addr;
	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
	ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);

#ifdef DEBUG
	/* debug builds cross-check index, hash and name against args */
	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
	ASSERT(args->index < ichdr.count);
	ASSERT(args->index >= 0);

	if (entry->flags & XFS_ATTR_LOCAL) {
		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
		namelen = name_loc->namelen;
		name = (char *)name_loc->nameval;
	} else {
		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
		namelen = name_rmt->namelen;
		name = (char *)name_rmt->name;
	}
	ASSERT(be32_to_cpu(entry->hashval) == args->hashval);
	ASSERT(namelen == args->namelen);
	ASSERT(memcmp(name, args->name, namelen) == 0);
#endif /* DEBUG */

	entry->flags &= ~XFS_ATTR_INCOMPLETE;
	xfs_trans_log_buf(args->trans, bp,
			 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));

	if (args->rmtblkno) {
		/* record where the remote value lives and log that too */
		ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
		xfs_trans_log_buf(args->trans, bp,
			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
	}

	return 0;
}

/*
 * Set the INCOMPLETE flag on an entry in a leaf block.
 *
 * The entry is args->index in block args->blkno.  For an entry without
 * XFS_ATTR_LOCAL the stored value block and length are zeroed as well.
 * Returns 0, or the error from xfs_attr3_leaf_read().
 */
int
xfs_attr3_leaf_setflag(
	struct xfs_da_args	*args)
{
	struct xfs_attr_leafblock *leaf;
	struct xfs_attr_leaf_entry *entry;
	struct xfs_attr_leaf_name_remote *name_rmt;
	struct xfs_buf		*bp;
	int error;
#ifdef DEBUG
	struct xfs_attr3_icleaf_hdr ichdr;
#endif

	trace_xfs_attr_leaf_setflag(args);

	/*
	 * Set up the operation.
	 */
	error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
			args->blkno, &bp);
	if (error)
		return error;

	leaf = bp->b_addr;
#ifdef DEBUG
	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
	ASSERT(args->index < ichdr.count);
	ASSERT(args->index >= 0);
#endif
	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];

	ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
	entry->flags |= XFS_ATTR_INCOMPLETE;
	xfs_trans_log_buf(args->trans, bp,
			XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
	if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
		name_rmt->valueblk = 0;
		name_rmt->valuelen = 0;
		xfs_trans_log_buf(args->trans, bp,
			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
	}

	return 0;
}

/*
 * In a single transaction, clear the INCOMPLETE flag on the leaf entry
 * given by args->blkno/index and set the INCOMPLETE flag on the leaf
 * entry given by args->blkno2/index2.
 *
 * Note that they could be in different blocks, or in the same block.
 *
 * Returns 0, or the error from xfs_attr3_leaf_read().
 */
int
xfs_attr3_leaf_flipflags(
	struct xfs_da_args	*args)
{
	struct xfs_attr_leafblock *leaf1;
	struct xfs_attr_leafblock *leaf2;
	struct xfs_attr_leaf_entry *entry1;
	struct xfs_attr_leaf_entry *entry2;
	struct xfs_attr_leaf_name_remote *name_rmt;
	struct xfs_buf		*bp1;
	struct xfs_buf		*bp2;
	int error;
#ifdef DEBUG
	struct xfs_attr3_icleaf_hdr ichdr1;
	struct xfs_attr3_icleaf_hdr ichdr2;
	xfs_attr_leaf_name_local_t *name_loc;
	int namelen1, namelen2;
	char *name1, *name2;
#endif /* DEBUG */

	trace_xfs_attr_leaf_flipflags(args);

	/*
	 * Read the block containing the "old" attr
	 */
	error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
			args->blkno, &bp1);
	if (error)
		return error;

	/*
	 * Read the block containing the "new" attr, if it is different
	 */
	if (args->blkno2 != args->blkno) {
		error = xfs_attr3_leaf_read(args->trans, args->dp, args->owner,
				args->blkno2, &bp2);
		if (error)
			return error;
	} else {
		/* same block: both entries share one buffer */
		bp2 = bp1;
	}

	leaf1 = bp1->b_addr;
	entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];

	leaf2 = bp2->b_addr;
	entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];

#ifdef DEBUG
	/* debug builds check that both entries carry the same hash and name */
	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1);
	ASSERT(args->index < ichdr1.count);
	ASSERT(args->index >= 0);

	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2);
	ASSERT(args->index2 < ichdr2.count);
	ASSERT(args->index2 >= 0);

	if (entry1->flags & XFS_ATTR_LOCAL) {
		name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
		namelen1 = name_loc->namelen;
		name1 = (char *)name_loc->nameval;
	} else {
		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
		namelen1 = name_rmt->namelen;
		name1 = (char *)name_rmt->name;
	}
	if (entry2->flags & XFS_ATTR_LOCAL) {
		name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
		namelen2 = name_loc->namelen;
		name2 = (char *)name_loc->nameval;
	} else {
		name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
		namelen2 = name_rmt->namelen;
		name2 = (char *)name_rmt->name;
	}
	ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval));
	ASSERT(namelen1 == namelen2);
	ASSERT(memcmp(name1, name2, namelen1) == 0);
#endif /* DEBUG */

	ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
	ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);

	/* first entry: clear the flag and, if remote, record the value location */
	entry1->flags &= ~XFS_ATTR_INCOMPLETE;
	xfs_trans_log_buf(args->trans, bp1,
			  XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
	if (args->rmtblkno) {
		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
		xfs_trans_log_buf(args->trans, bp1,
			 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
	}

	/* second entry: set the flag and, if remote, zero the value location */
	entry2->flags |= XFS_ATTR_INCOMPLETE;
	xfs_trans_log_buf(args->trans, bp2,
			  XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
	if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
		name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
		name_rmt->valueblk = 0;
		name_rmt->valuelen = 0;
		xfs_trans_log_buf(args->trans, bp2,
			 XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
	}

	return 0;
}
 |
| 2 42 5 4978 4902 111 7 1079 2 64 19 46 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ #include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/minmax.h> #include 
<linux/nospec.h> #include <linux/sched.h> #include <linux/thread_info.h> #include <asm/uaccess.h> /* * Architectures that support memory tagging (assigning tags to memory regions, * embedding these tags into addresses that point to these memory regions, and * checking that the memory and the pointer tags match on memory accesses) * redefine this macro to strip tags from pointers. * * Passing down mm_struct allows to define untagging rules on per-process * basis. * * It's defined as noop for architectures that don't support memory tagging. */ #ifndef untagged_addr #define untagged_addr(addr) (addr) #endif #ifndef untagged_addr_remote #define untagged_addr_remote(mm, addr) ({ \ mmap_assert_locked(mm); \ untagged_addr(addr); \ }) #endif /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and * __copy_{to,from}_user{,_inatomic}(). * * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and * return the amount left to copy. They should assume that access_ok() has * already been checked (and succeeded); they should *not* zero-pad anything. * No KASAN or object size checks either - those belong here. * * Both of these functions should attempt to copy size bytes starting at from * into the area starting at to. They must not fetch or store anything * outside of those areas. Return value must be between 0 (everything * copied successfully) and size (nothing copied). * * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting * at to must become equal to the bytes fetched from the corresponding area * starting at from. All data past to + size - N must be left unmodified. * * If copying succeeds, the return value must be 0. If some data cannot be * fetched, it is permitted to copy less than had been fetched; the only * hard requirement is that not storing anything at all (i.e. returning size) * should happen only when nothing could be copied. 
In other words, you don't * have to squeeze as much as possible - it is allowed, but not necessary. * * For raw_copy_from_user() to always points to kernel memory and no faults * on store should happen. Interpretation of from is affected by set_fs(). * For raw_copy_to_user() it's the other way round. * * Both can be inlined - it's up to architectures whether it wants to bother * with that. They should not be used directly; they are used to implement * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user_inatomic()) * that are used instead. Out of those, __... ones are inlined. Plain * copy_{to,from}_user() might or might not be inlined. If you want them * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER. * * NOTE: only copy_from_user() zero-pads the destination in case of short copy. * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything * at all; their callers absolutely must check the return value. * * Biarch ones should also provide raw_copy_in_user() - similar to the above, * but both source and destination are __user pointers (affected by set_fs() * as usual) and both source and destination can trigger faults. */
/*
 * Copy n bytes from user space with the least checking of the variants
 * in this file: no access_ok(), no might_fault() and no fault-injection
 * check is made here.  The destination object size is checked and the
 * copy is bracketed by the instrumentation hooks.
 *
 * Returns the value returned by raw_copy_from_user().  Nothing is zeroed
 * on a short copy.
 */
static __always_inline __must_check unsigned long
__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
{
	unsigned long res;

	instrument_copy_from_user_before(to, from, n);
	check_object_size(to, n, false);
	res = raw_copy_from_user(to, from, n);
	instrument_copy_from_user_after(to, from, n, res);
	return res;
}

/*
 * Same copy as __copy_from_user_inatomic(), preceded by might_fault()
 * and a should_fail_usercopy() check.  When that check fires, n is
 * returned and nothing is copied.
 *
 * Returns n on injected failure, otherwise the value returned by
 * raw_copy_from_user().  Nothing is zeroed on a short copy.
 */
static __always_inline __must_check unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
	unsigned long res;

	might_fault();
	instrument_copy_from_user_before(to, from, n);
	/*
	 * The before hook has already run at this point; on the injected
	 * failure path the after hook is not called.
	 */
	if (should_fail_usercopy())
		return n;
	check_object_size(to, n, false);
	res = raw_copy_from_user(to, from, n);
	instrument_copy_from_user_after(to, from, n, res);
	return res;
}

/** * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
* @to: Destination address, in user space. * @from: Source address, in kernel space. * @n: Number of bytes to copy. * * Context: User context only. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. */
static __always_inline __must_check unsigned long
__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
{
	/* injected failure: report all n bytes as not copied */
	if (should_fail_usercopy())
		return n;
	instrument_copy_to_user(to, from, n);
	check_object_size(from, n, true);
	return raw_copy_to_user(to, from, n);
}

/*
 * Same sequence as __copy_to_user_inatomic(), preceded by might_fault().
 * No access_ok() check is made here.
 *
 * Returns n on injected failure, otherwise the value returned by
 * raw_copy_to_user().
 */
static __always_inline __must_check unsigned long
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
	might_fault();
	if (should_fail_usercopy())
		return n;
	instrument_copy_to_user(to, from, n);
	check_object_size(from, n, true);
	return raw_copy_to_user(to, from, n);
}

/* * Architectures that #define INLINE_COPY_TO_USER use this function * directly in the normal copy_to/from_user(), the other ones go * through an extern _copy_to/from_user(), which expands the same code * here. * * Rust code always uses the extern definition.
*/
/*
 * Checked copy from user space with zero padding.
 *
 * The copy is attempted only when should_fail_usercopy() does not fire
 * and access_ok(from, n) succeeds; otherwise res keeps its initial
 * value n.  Whatever part of the destination was not copied (the last
 * res bytes) is then cleared.
 *
 * Returns the number of bytes that were not copied.
 */
static inline __must_check unsigned long
_inline_copy_from_user(void *to, const void __user *from, unsigned long n)
{
	unsigned long res = n;
	might_fault();
	if (!should_fail_usercopy() && likely(access_ok(from, n))) {
		/*
		 * Ensure that bad access_ok() speculation will not
		 * lead to nasty side effects *after* the copy is
		 * finished:
		 */
		barrier_nospec();
		instrument_copy_from_user_before(to, from, n);
		res = raw_copy_from_user(to, from, n);
		instrument_copy_from_user_after(to, from, n, res);
	}
	/* zero the tail that was not filled in */
	if (unlikely(res))
		memset(to + (n - res), 0, res);
	return res;
}
extern __must_check unsigned long
_copy_from_user(void *, const void __user *, unsigned long);

/*
 * Checked copy to user space.
 *
 * Returns n unchanged when should_fail_usercopy() fires or when
 * access_ok(to, n) fails; otherwise returns the value returned by
 * raw_copy_to_user().
 */
static inline __must_check unsigned long
_inline_copy_to_user(void __user *to, const void *from, unsigned long n)
{
	might_fault();
	if (should_fail_usercopy())
		return n;
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = raw_copy_to_user(to, from, n);
	}
	return n;
}
extern __must_check unsigned long
_copy_to_user(void __user *, const void *, unsigned long);

/*
 * Copy n bytes from user space into to.
 *
 * When check_copy_size() rejects the request, n is returned at once and
 * the destination is left untouched.  Otherwise the work is done by
 * _inline_copy_from_user() if INLINE_COPY_FROM_USER is defined, or by
 * the extern _copy_from_user() if it is not.
 */
static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n)
{
	if (!check_copy_size(to, n, false))
		return n;
#ifdef INLINE_COPY_FROM_USER
	return _inline_copy_from_user(to, from, n);
#else
	return _copy_from_user(to, from, n);
#endif
}

/*
 * Copy n bytes from from into user space.
 *
 * When check_copy_size() rejects the request, n is returned at once.
 * Otherwise the work is done by _inline_copy_to_user() if
 * INLINE_COPY_TO_USER is defined, or by the extern _copy_to_user() if
 * it is not.
 */
static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
	if (!check_copy_size(from, n, true))
		return n;

#ifdef INLINE_COPY_TO_USER
	return _inline_copy_to_user(to, from, n);
#else
	return _copy_to_user(to, from, n);
#endif
}

#ifndef copy_mc_to_kernel
/* * Without arch opt-in this generic copy_mc_to_kernel() will not handle * #MC (or arch equivalent) during source read.
*/ static inline unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, size_t cnt) { memcpy(dst, src, cnt); return 0; } #endif static __always_inline void pagefault_disabled_inc(void) { current->pagefault_disabled++; } static __always_inline void pagefault_disabled_dec(void) { current->pagefault_disabled--; } /* * These routines enable/disable the pagefault handler. If disabled, it will * not take any locks and go straight to the fixup table. * * User access methods will not sleep when called from a pagefault_disabled() * environment. */ static inline void pagefault_disable(void) { pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault * can hit. */ barrier(); } static inline void pagefault_enable(void) { /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); pagefault_disabled_dec(); } /* * Is the pagefault handler disabled? If so, user access methods will not sleep. */ static inline bool pagefault_disabled(void) { return current->pagefault_disabled != 0; } /* * The pagefault handler is in general disabled by pagefault_disable() or * when in irq context (via in_atomic()). * * This function should only be used by the fault handlers. Other users should * stick to pagefault_disabled(). * Please NEVER use preempt_disable() to disable the fault handler. With * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled. * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT. */ #define faulthandler_disabled() (pagefault_disabled() || in_atomic()) #ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS /** * probe_subpage_writeable: probe the user range for write faults at sub-page * granularity (e.g. arm64 MTE) * @uaddr: start of address range * @size: size of address range * * Returns 0 on success, the number of bytes not probed on fault. 
* * It is expected that the caller checked for the write permission of each * page in the range either by put_user() or GUP. The architecture port can * implement a more efficient get_user() probing if the same sub-page faults * are triggered by either a read or a write. */ static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size) { return 0; } #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ #ifndef ARCH_HAS_NOCACHE_UACCESS static inline __must_check unsigned long __copy_from_user_inatomic_nocache(void *to, const void __user *from, unsigned long n) { return __copy_from_user_inatomic(to, from, n); } #endif /* ARCH_HAS_NOCACHE_UACCESS */ extern __must_check int check_zeroed_user(const void __user *from, size_t size); /** * copy_struct_from_user: copy a struct from userspace * @dst: Destination address, in kernel space. This buffer must be @ksize * bytes long. * @ksize: Size of @dst struct. * @src: Source address, in userspace. * @usize: (Alleged) size of @src struct. * * Copies a struct from userspace to kernel space, in a way that guarantees * backwards-compatibility for struct syscall arguments (as long as future * struct extensions are made such that all new fields are *appended* to the * old struct, and zeroed-out new fields have the same meaning as the old * struct). * * @ksize is just sizeof(*dst), and @usize should've been passed by userspace. * The recommended usage is something like the following: * * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize) * { * int err; * struct foo karg = {}; * * if (usize > PAGE_SIZE) * return -E2BIG; * if (usize < FOO_SIZE_VER0) * return -EINVAL; * * err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); * if (err) * return err; * * // ... * } * * There are three cases to consider: * * If @usize == @ksize, then it's copied verbatim. * * If @usize < @ksize, then the userspace has passed an old struct to a * newer kernel. 
The rest of the trailing bytes in @dst (@ksize - @usize) * are to be zero-filled. * * If @usize > @ksize, then the userspace has passed a new struct to an * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize) * are checked to ensure they are zeroed, otherwise -E2BIG is returned. * * Returns (in all cases, some data may have been copied): * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src. * * -EFAULT: access to userspace failed. */ static __always_inline __must_check int copy_struct_from_user(void *dst, size_t ksize, const void __user *src, size_t usize) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; /* Double check if ksize is larger than a known object size. */ if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1))) return -E2BIG; /* Deal with trailing bytes. */ if (usize < ksize) { memset(dst + size, 0, rest); } else if (usize > ksize) { int ret = check_zeroed_user(src + size, rest); if (ret <= 0) return ret ?: -E2BIG; } /* Copy the interoperable parts of the struct. 
*/ if (copy_from_user(dst, src, size)) return -EFAULT; return 0; } bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size); long copy_from_kernel_nofault(void *dst, const void *src, size_t size); long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size); long copy_from_user_nofault(void *dst, const void __user *src, size_t size); long notrace copy_to_user_nofault(void __user *dst, const void *src, size_t size); long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count); long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count); long strnlen_user_nofault(const void __user *unsafe_addr, long count); #ifndef __get_kernel_nofault #define __get_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(src); \ type data; \ if (__get_user(data, p)) \ goto label; \ *(type *)dst = data; \ } while (0) #define __put_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(dst); \ type data = *(type *)src; \ if (__put_user(data, p)) \ goto label; \ } while (0) #endif /** * get_kernel_nofault(): safely attempt to read from a location * @val: read into this variable * @ptr: address to read from * * Returns 0 on success, or -EFAULT. 
*/ #define get_kernel_nofault(val, ptr) ({ \ const typeof(val) *__gk_ptr = (ptr); \ copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\ }) #ifndef user_access_begin #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) #define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e) #define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e) #define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e) #define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif #ifndef user_write_access_begin #define user_write_access_begin user_access_begin #define user_write_access_end user_access_end #endif #ifndef user_read_access_begin #define user_read_access_begin user_access_begin #define user_read_access_end user_access_end #endif #ifdef CONFIG_HARDENED_USERCOPY void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len); #endif #endif /* __LINUX_UACCESS_H__ */ |
| 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 | /* * linux/drivers/video/console/bitblit.c -- BitBlitting Operation * * Originally from the 'accel_*' routines in drivers/video/console/fbcon.c * * Copyright (C) 2004 Antonino Daplas <adaplas @pol.net> * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive for * more details. 
*/ #include <linux/module.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/fb.h> #include <linux/vt_kern.h> #include <linux/console.h> #include <asm/types.h> #include "fbcon.h" /* * Accelerated handlers. */ static void update_attr(u8 *dst, u8 *src, int attribute, struct vc_data *vc) { int i, offset = (vc->vc_font.height < 10) ? 1 : 2; int width = DIV_ROUND_UP(vc->vc_font.width, 8); unsigned int cellsize = vc->vc_font.height * width; u8 c; offset = cellsize - (offset * width); for (i = 0; i < cellsize; i++) { c = src[i]; if (attribute & FBCON_ATTRIBUTE_UNDERLINE && i >= offset) c = 0xff; if (attribute & FBCON_ATTRIBUTE_BOLD) c |= c >> 1; if (attribute & FBCON_ATTRIBUTE_REVERSE) c = ~c; dst[i] = c; } } static void bit_bmove(struct vc_data *vc, struct fb_info *info, int sy, int sx, int dy, int dx, int height, int width) { struct fb_copyarea area; area.sx = sx * vc->vc_font.width; area.sy = sy * vc->vc_font.height; area.dx = dx * vc->vc_font.width; area.dy = dy * vc->vc_font.height; area.height = height * vc->vc_font.height; area.width = width * vc->vc_font.width; info->fbops->fb_copyarea(info, &area); } static void bit_clear(struct vc_data *vc, struct fb_info *info, int sy, int sx, int height, int width) { int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; struct fb_fillrect region; region.color = attr_bgcol_ec(bgshift, vc, info); region.dx = sx * vc->vc_font.width; region.dy = sy * vc->vc_font.height; region.width = width * vc->vc_font.width; region.height = height * vc->vc_font.height; region.rop = ROP_COPY; info->fbops->fb_fillrect(info, ®ion); } static inline void bit_putcs_aligned(struct vc_data *vc, struct fb_info *info, const u16 *s, u32 attr, u32 cnt, u32 d_pitch, u32 s_pitch, u32 cellsize, struct fb_image *image, u8 *buf, u8 *dst) { u16 charmask = vc->vc_hi_font_mask ? 
0x1ff : 0xff; u32 idx = vc->vc_font.width >> 3; u8 *src; while (cnt--) { src = vc->vc_font.data + (scr_readw(s++)& charmask)*cellsize; if (attr) { update_attr(buf, src, attr, vc); src = buf; } if (likely(idx == 1)) __fb_pad_aligned_buffer(dst, d_pitch, src, idx, image->height); else fb_pad_aligned_buffer(dst, d_pitch, src, idx, image->height); dst += s_pitch; } info->fbops->fb_imageblit(info, image); } static inline void bit_putcs_unaligned(struct vc_data *vc, struct fb_info *info, const u16 *s, u32 attr, u32 cnt, u32 d_pitch, u32 s_pitch, u32 cellsize, struct fb_image *image, u8 *buf, u8 *dst) { u16 charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; u32 shift_low = 0, mod = vc->vc_font.width % 8; u32 shift_high = 8; u32 idx = vc->vc_font.width >> 3; u8 *src; while (cnt--) { src = vc->vc_font.data + (scr_readw(s++)& charmask)*cellsize; if (attr) { update_attr(buf, src, attr, vc); src = buf; } fb_pad_unaligned_buffer(dst, d_pitch, src, idx, image->height, shift_high, shift_low, mod); shift_low += mod; dst += (shift_low >= 8) ? 
s_pitch : s_pitch - 1; shift_low &= 7; shift_high = 8 - shift_low; } info->fbops->fb_imageblit(info, image); } static void bit_putcs(struct vc_data *vc, struct fb_info *info, const unsigned short *s, int count, int yy, int xx, int fg, int bg) { struct fb_image image; u32 width = DIV_ROUND_UP(vc->vc_font.width, 8); u32 cellsize = width * vc->vc_font.height; u32 maxcnt = info->pixmap.size/cellsize; u32 scan_align = info->pixmap.scan_align - 1; u32 buf_align = info->pixmap.buf_align - 1; u32 mod = vc->vc_font.width % 8, cnt, pitch, size; u32 attribute = get_attribute(info, scr_readw(s)); u8 *dst, *buf = NULL; image.fg_color = fg; image.bg_color = bg; image.dx = xx * vc->vc_font.width; image.dy = yy * vc->vc_font.height; image.height = vc->vc_font.height; image.depth = 1; if (attribute) { buf = kmalloc(cellsize, GFP_ATOMIC); if (!buf) return; } while (count) { if (count > maxcnt) cnt = maxcnt; else cnt = count; image.width = vc->vc_font.width * cnt; pitch = DIV_ROUND_UP(image.width, 8) + scan_align; pitch &= ~scan_align; size = pitch * image.height + buf_align; size &= ~buf_align; dst = fb_get_buffer_offset(info, &info->pixmap, size); image.data = dst; if (!mod) bit_putcs_aligned(vc, info, s, attribute, cnt, pitch, width, cellsize, &image, buf, dst); else bit_putcs_unaligned(vc, info, s, attribute, cnt, pitch, width, cellsize, &image, buf, dst); image.dx += cnt * vc->vc_font.width; count -= cnt; s += cnt; } /* buf is always NULL except when in monochrome mode, so in this case it's a gain to check buf against NULL even though kfree() handles NULL pointers just fine */ if (unlikely(buf)) kfree(buf); } static void bit_clear_margins(struct vc_data *vc, struct fb_info *info, int color, int bottom_only) { unsigned int cw = vc->vc_font.width; unsigned int ch = vc->vc_font.height; unsigned int rw = info->var.xres - (vc->vc_cols*cw); unsigned int bh = info->var.yres - (vc->vc_rows*ch); unsigned int rs = info->var.xres - rw; unsigned int bs = info->var.yres - bh; struct 
fb_fillrect region; region.color = color; region.rop = ROP_COPY; if ((int) rw > 0 && !bottom_only) { region.dx = info->var.xoffset + rs; region.dy = 0; region.width = rw; region.height = info->var.yres_virtual; info->fbops->fb_fillrect(info, ®ion); } if ((int) bh > 0) { region.dx = info->var.xoffset; region.dy = info->var.yoffset + bs; region.width = rs; region.height = bh; info->fbops->fb_fillrect(info, ®ion); } } static void bit_cursor(struct vc_data *vc, struct fb_info *info, bool enable, int fg, int bg) { struct fb_cursor cursor; struct fbcon_ops *ops = info->fbcon_par; unsigned short charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; int w = DIV_ROUND_UP(vc->vc_font.width, 8), c; int y = real_y(ops->p, vc->state.y); int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1; char *src; cursor.set = 0; if (!vc->vc_font.data) return; c = scr_readw((u16 *) vc->vc_pos); attribute = get_attribute(info, c); src = vc->vc_font.data + ((c & charmask) * (w * vc->vc_font.height)); if (ops->cursor_state.image.data != src || ops->cursor_reset) { ops->cursor_state.image.data = src; cursor.set |= FB_CUR_SETIMAGE; } if (attribute) { u8 *dst; dst = kmalloc_array(w, vc->vc_font.height, GFP_ATOMIC); if (!dst) return; kfree(ops->cursor_data); ops->cursor_data = dst; update_attr(dst, src, attribute, vc); src = dst; } if (ops->cursor_state.image.fg_color != fg || ops->cursor_state.image.bg_color != bg || ops->cursor_reset) { ops->cursor_state.image.fg_color = fg; ops->cursor_state.image.bg_color = bg; cursor.set |= FB_CUR_SETCMAP; } if ((ops->cursor_state.image.dx != (vc->vc_font.width * vc->state.x)) || (ops->cursor_state.image.dy != (vc->vc_font.height * y)) || ops->cursor_reset) { ops->cursor_state.image.dx = vc->vc_font.width * vc->state.x; ops->cursor_state.image.dy = vc->vc_font.height * y; cursor.set |= FB_CUR_SETPOS; } if (ops->cursor_state.image.height != vc->vc_font.height || ops->cursor_state.image.width != vc->vc_font.width || ops->cursor_reset) { 
ops->cursor_state.image.height = vc->vc_font.height; ops->cursor_state.image.width = vc->vc_font.width; cursor.set |= FB_CUR_SETSIZE; } if (ops->cursor_state.hot.x || ops->cursor_state.hot.y || ops->cursor_reset) { ops->cursor_state.hot.x = cursor.hot.y = 0; cursor.set |= FB_CUR_SETHOT; } if (cursor.set & FB_CUR_SETSIZE || vc->vc_cursor_type != ops->p->cursor_shape || ops->cursor_state.mask == NULL || ops->cursor_reset) { char *mask = kmalloc_array(w, vc->vc_font.height, GFP_ATOMIC); int cur_height, size, i = 0; u8 msk = 0xff; if (!mask) return; kfree(ops->cursor_state.mask); ops->cursor_state.mask = mask; ops->p->cursor_shape = vc->vc_cursor_type; cursor.set |= FB_CUR_SETSHAPE; switch (CUR_SIZE(ops->p->cursor_shape)) { case CUR_NONE: cur_height = 0; break; case CUR_UNDERLINE: cur_height = (vc->vc_font.height < 10) ? 1 : 2; break; case CUR_LOWER_THIRD: cur_height = vc->vc_font.height/3; break; case CUR_LOWER_HALF: cur_height = vc->vc_font.height >> 1; break; case CUR_TWO_THIRDS: cur_height = (vc->vc_font.height << 1)/3; break; case CUR_BLOCK: default: cur_height = vc->vc_font.height; break; } size = (vc->vc_font.height - cur_height) * w; while (size--) mask[i++] = ~msk; size = cur_height * w; while (size--) mask[i++] = msk; } ops->cursor_state.enable = enable && !use_sw; cursor.image.data = src; cursor.image.fg_color = ops->cursor_state.image.fg_color; cursor.image.bg_color = ops->cursor_state.image.bg_color; cursor.image.dx = ops->cursor_state.image.dx; cursor.image.dy = ops->cursor_state.image.dy; cursor.image.height = ops->cursor_state.image.height; cursor.image.width = ops->cursor_state.image.width; cursor.hot.x = ops->cursor_state.hot.x; cursor.hot.y = ops->cursor_state.hot.y; cursor.mask = ops->cursor_state.mask; cursor.enable = ops->cursor_state.enable; cursor.image.depth = 1; cursor.rop = ROP_XOR; if (info->fbops->fb_cursor) err = info->fbops->fb_cursor(info, &cursor); if (err) soft_cursor(info, &cursor); ops->cursor_reset = 0; } static int 
bit_update_start(struct fb_info *info) { struct fbcon_ops *ops = info->fbcon_par; int err; err = fb_pan_display(info, &ops->var); ops->var.xoffset = info->var.xoffset; ops->var.yoffset = info->var.yoffset; ops->var.vmode = info->var.vmode; return err; } void fbcon_set_bitops(struct fbcon_ops *ops) { ops->bmove = bit_bmove; ops->clear = bit_clear; ops->putcs = bit_putcs; ops->clear_margins = bit_clear_margins; ops->cursor = bit_cursor; ops->update_start = bit_update_start; ops->rotate_font = NULL; if (ops->rotate) fbcon_set_rotate(ops); } |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 
1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 
1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 
2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 
2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 
3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 
3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 
3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 
4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 
4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 | /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004, 2020 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. 
*/

#ifndef IB_VERBS_H
#define IB_VERBS_H

#include <linux/ethtool.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/rwsem.h>
#include <linux/workqueue.h>
#include <linux/irq_poll.h>
#include <uapi/linux/if_ether.h>
#include <net/ipv6.h>
#include <net/ip.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/refcount.h>
#include <linux/if_link.h>
#include <linux/atomic.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/cgroup_rdma.h>
#include <linux/irqflags.h>
#include <linux/preempt.h>
#include <linux/dim.h>
#include <uapi/rdma/ib_user_verbs.h>
#include <rdma/rdma_counter.h>
#include <rdma/restrack.h>
#include <rdma/signature.h>
#include <uapi/rdma/rdma_user_ioctl.h>
#include <uapi/rdma/ib_user_ioctl_verbs.h>

/* Firmware version strings share their maximum length with ethtool. */
#define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN

struct ib_umem_odp;
struct ib_uqp_object;
struct ib_usrq_object;
struct ib_uwq_object;
struct rdma_cm_id;
struct ib_port;
struct hw_stats_device_data;

extern struct workqueue_struct *ib_wq;
extern struct workqueue_struct *ib_comp_wq;
extern struct workqueue_struct *ib_comp_unbound_wq;

struct ib_ucq_object;

/*
 * Per-device logging helpers.  ibdev_printk() takes the log level as its
 * first argument; each ibdev_<level>() variant takes only the device and a
 * printf-style format.  All are annotated with __printf() so the format
 * string is checked against its arguments.
 */
__printf(3, 4) __cold
void ibdev_printk(const char *level, const struct ib_device *ibdev,
		  const char *format, ...);
__printf(2, 3) __cold
void ibdev_emerg(const struct ib_device *ibdev, const char *format, ...);
__printf(2, 3) __cold
void ibdev_alert(const struct ib_device *ibdev, const char *format, ...);
__printf(2, 3) __cold
void ibdev_crit(const struct ib_device *ibdev, const char *format, ...);
__printf(2, 3) __cold
void ibdev_err(const struct ib_device *ibdev, const char *format, ...);
__printf(2, 3) __cold
void ibdev_warn(const struct ib_device *ibdev, const char *format, ...);
__printf(2, 3) __cold
void ibdev_notice(const struct ib_device *ibdev, const char *format, ...);
__printf(2, 3) __cold
void ibdev_info(const struct ib_device *ibdev, const char *format, ...);

/*
 * ibdev_dbg() forwards to dynamic_ibdev_dbg() when dynamic debug is
 * available; otherwise it is an empty inline stub that still carries the
 * __printf(2, 3) annotation.
 */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
	(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define ibdev_dbg(__dev, format, args...) \
	dynamic_ibdev_dbg(__dev, format, ##args)
#else
__printf(2, 3) __cold
static inline
void ibdev_dbg(const struct ib_device *ibdev, const char *format, ...) {}
#endif

/*
 * Invoke @ibdev_level(ibdev, fmt, ...) only when __ratelimit() permits it.
 * The ratelimit state is a static object declared inside the expansion, so
 * every use of the macro has its own state, initialised with
 * DEFAULT_RATELIMIT_INTERVAL and DEFAULT_RATELIMIT_BURST.
 */
#define ibdev_level_ratelimited(ibdev_level, ibdev, fmt, ...) \
do { \
	static DEFINE_RATELIMIT_STATE(_rs, \
				      DEFAULT_RATELIMIT_INTERVAL, \
				      DEFAULT_RATELIMIT_BURST); \
	if (__ratelimit(&_rs)) \
		ibdev_level(ibdev, fmt, ##__VA_ARGS__); \
} while (0)

/* Rate-limited counterparts of the ibdev_<level>() helpers. */
#define ibdev_emerg_ratelimited(ibdev, fmt, ...) \
	ibdev_level_ratelimited(ibdev_emerg, ibdev, fmt, ##__VA_ARGS__)
#define ibdev_alert_ratelimited(ibdev, fmt, ...) \
	ibdev_level_ratelimited(ibdev_alert, ibdev, fmt, ##__VA_ARGS__)
#define ibdev_crit_ratelimited(ibdev, fmt, ...) \
	ibdev_level_ratelimited(ibdev_crit, ibdev, fmt, ##__VA_ARGS__)
#define ibdev_err_ratelimited(ibdev, fmt, ...) \
	ibdev_level_ratelimited(ibdev_err, ibdev, fmt, ##__VA_ARGS__)
#define ibdev_warn_ratelimited(ibdev, fmt, ...) \
	ibdev_level_ratelimited(ibdev_warn, ibdev, fmt, ##__VA_ARGS__)
#define ibdev_notice_ratelimited(ibdev, fmt, ...) \
	ibdev_level_ratelimited(ibdev_notice, ibdev, fmt, ##__VA_ARGS__)
#define ibdev_info_ratelimited(ibdev, fmt, ...) \
	ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__)

#if defined(CONFIG_DYNAMIC_DEBUG) || \
	(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* descriptor check is first to prevent flooding with "callbacks suppressed" */
#define ibdev_dbg_ratelimited(ibdev, fmt, ...) \
do { \
	static DEFINE_RATELIMIT_STATE(_rs, \
				      DEFAULT_RATELIMIT_INTERVAL, \
				      DEFAULT_RATELIMIT_BURST); \
	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
	if (DYNAMIC_DEBUG_BRANCH(descriptor) && __ratelimit(&_rs)) \
		__dynamic_ibdev_dbg(&descriptor, ibdev, fmt, \
				    ##__VA_ARGS__); \
} while (0)
#else
/* Empty stub used when dynamic debug is not available. */
__printf(2, 3) __cold
static inline
void ibdev_dbg_ratelimited(const struct ib_device *ibdev, const char *format, ...) {}
#endif

/*
 * A 16-byte GID, accessible either as raw bytes or as a big-endian
 * subnet prefix followed by a big-endian interface ID.
 */
union ib_gid {
	u8 raw[16];
	struct {
		__be64 subnet_prefix;
		__be64 interface_id;
	} global;
};

extern union ib_gid zgid;

/*
 * GID types; the first three values mirror the uverbs ABI constants.
 * IB_GID_TYPE_SIZE is the last enumerator and has no uverbs counterpart.
 */
enum ib_gid_type {
	IB_GID_TYPE_IB = IB_UVERBS_GID_TYPE_IB,
	IB_GID_TYPE_ROCE = IB_UVERBS_GID_TYPE_ROCE_V1,
	IB_GID_TYPE_ROCE_UDP_ENCAP = IB_UVERBS_GID_TYPE_ROCE_V2,
	IB_GID_TYPE_SIZE
};

#define ROCE_V2_UDP_DPORT 4791

struct ib_gid_attr {
	struct net_device __rcu *ndev;
	struct ib_device *device;
	union ib_gid gid;
	enum ib_gid_type gid_type;
	u16 index;
	u32 port_num;
};

enum {
	/* set the locally administered indication */
	IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2,
};

enum rdma_transport_type {
	RDMA_TRANSPORT_IB,
	RDMA_TRANSPORT_IWARP,
	RDMA_TRANSPORT_USNIC,
	RDMA_TRANSPORT_USNIC_UDP,
	RDMA_TRANSPORT_UNSPECIFIED,
};

enum rdma_protocol_type {
	RDMA_PROTOCOL_IB,
	RDMA_PROTOCOL_IBOE,
	RDMA_PROTOCOL_IWARP,
	RDMA_PROTOCOL_USNIC_UDP
};

__attribute_const__ enum rdma_transport_type
rdma_node_get_transport(unsigned int node_type);

enum rdma_network_type {
	RDMA_NETWORK_IB,
	RDMA_NETWORK_ROCE_V1,
	RDMA_NETWORK_IPV4,
	RDMA_NETWORK_IPV6
};

/*
 * Map a network type to the GID type used for it: IPv4 and IPv6 map to
 * IB_GID_TYPE_ROCE_UDP_ENCAP, RoCE v1 maps to IB_GID_TYPE_ROCE, and every
 * other value maps to IB_GID_TYPE_IB.
 */
static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type)
{
	if (network_type == RDMA_NETWORK_IPV4 ||
	    network_type == RDMA_NETWORK_IPV6)
		return IB_GID_TYPE_ROCE_UDP_ENCAP;
	else if (network_type == RDMA_NETWORK_ROCE_V1)
		return IB_GID_TYPE_ROCE;
	else
		return IB_GID_TYPE_IB;
}

/*
 * Derive the network type of a GID attribute.  IB and RoCE v1 GID types map
 * directly; for any other GID type the GID itself is examined: a v4-mapped
 * IPv6 address yields RDMA_NETWORK_IPV4, anything else RDMA_NETWORK_IPV6.
 */
static inline enum rdma_network_type
rdma_gid_attr_network_type(const struct ib_gid_attr *attr)
{
	if (attr->gid_type == IB_GID_TYPE_IB)
		return RDMA_NETWORK_IB;

	if (attr->gid_type == IB_GID_TYPE_ROCE)
		return RDMA_NETWORK_ROCE_V1;

	if (ipv6_addr_v4mapped((struct in6_addr *)&attr->gid))
		return RDMA_NETWORK_IPV4;
	else
		return RDMA_NETWORK_IPV6;
}

enum rdma_link_layer {
	IB_LINK_LAYER_UNSPECIFIED,
	IB_LINK_LAYER_INFINIBAND,
	IB_LINK_LAYER_ETHERNET,
};

/* Device capability flags; each value mirrors its IB_UVERBS_DEVICE_* constant. */
enum ib_device_cap_flags {
	IB_DEVICE_RESIZE_MAX_WR = IB_UVERBS_DEVICE_RESIZE_MAX_WR,
	IB_DEVICE_BAD_PKEY_CNTR = IB_UVERBS_DEVICE_BAD_PKEY_CNTR,
	IB_DEVICE_BAD_QKEY_CNTR = IB_UVERBS_DEVICE_BAD_QKEY_CNTR,
	IB_DEVICE_RAW_MULTI = IB_UVERBS_DEVICE_RAW_MULTI,
	IB_DEVICE_AUTO_PATH_MIG = IB_UVERBS_DEVICE_AUTO_PATH_MIG,
	IB_DEVICE_CHANGE_PHY_PORT = IB_UVERBS_DEVICE_CHANGE_PHY_PORT,
	IB_DEVICE_UD_AV_PORT_ENFORCE = IB_UVERBS_DEVICE_UD_AV_PORT_ENFORCE,
	IB_DEVICE_CURR_QP_STATE_MOD = IB_UVERBS_DEVICE_CURR_QP_STATE_MOD,
	IB_DEVICE_SHUTDOWN_PORT = IB_UVERBS_DEVICE_SHUTDOWN_PORT,
	/* IB_DEVICE_INIT_TYPE = IB_UVERBS_DEVICE_INIT_TYPE, (not in use) */
	IB_DEVICE_PORT_ACTIVE_EVENT = IB_UVERBS_DEVICE_PORT_ACTIVE_EVENT,
	IB_DEVICE_SYS_IMAGE_GUID = IB_UVERBS_DEVICE_SYS_IMAGE_GUID,
	IB_DEVICE_RC_RNR_NAK_GEN = IB_UVERBS_DEVICE_RC_RNR_NAK_GEN,
	IB_DEVICE_SRQ_RESIZE = IB_UVERBS_DEVICE_SRQ_RESIZE,
	IB_DEVICE_N_NOTIFY_CQ = IB_UVERBS_DEVICE_N_NOTIFY_CQ,

	/* Reserved, old SEND_W_INV = 1 << 16,*/
	IB_DEVICE_MEM_WINDOW = IB_UVERBS_DEVICE_MEM_WINDOW,
	/*
	 * Devices should set IB_DEVICE_UD_IP_CSUM if they support
	 * insertion of UDP and TCP checksum on outgoing UD IPoIB
	 * messages and can verify the validity of checksum for
	 * incoming messages.  Setting this flag implies that the
	 * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode.
	 */
	IB_DEVICE_UD_IP_CSUM = IB_UVERBS_DEVICE_UD_IP_CSUM,
	IB_DEVICE_XRC = IB_UVERBS_DEVICE_XRC,

	/*
	 * This device supports the IB "base memory management extension",
	 * which includes support for fast registrations (IB_WR_REG_MR,
	 * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs).  This flag should
	 * also be set by any iWarp device which must support FRs to comply
	 * with the iWarp verbs spec.  iWarp devices also support the
	 * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the
	 * stag.
	 */
	IB_DEVICE_MEM_MGT_EXTENSIONS = IB_UVERBS_DEVICE_MEM_MGT_EXTENSIONS,
	IB_DEVICE_MEM_WINDOW_TYPE_2A = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2A,
	IB_DEVICE_MEM_WINDOW_TYPE_2B = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2B,
	IB_DEVICE_RC_IP_CSUM = IB_UVERBS_DEVICE_RC_IP_CSUM,
	/* Deprecated. Please use IB_RAW_PACKET_CAP_IP_CSUM. */
	IB_DEVICE_RAW_IP_CSUM = IB_UVERBS_DEVICE_RAW_IP_CSUM,
	IB_DEVICE_MANAGED_FLOW_STEERING =
		IB_UVERBS_DEVICE_MANAGED_FLOW_STEERING,
	/* Deprecated. Please use IB_RAW_PACKET_CAP_SCATTER_FCS. */
	IB_DEVICE_RAW_SCATTER_FCS = IB_UVERBS_DEVICE_RAW_SCATTER_FCS,
	/* The device supports padding incoming writes to cacheline. */
	IB_DEVICE_PCI_WRITE_END_PADDING =
		IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING,
	/* Placement type attributes */
	IB_DEVICE_FLUSH_GLOBAL = IB_UVERBS_DEVICE_FLUSH_GLOBAL,
	IB_DEVICE_FLUSH_PERSISTENT = IB_UVERBS_DEVICE_FLUSH_PERSISTENT,
	IB_DEVICE_ATOMIC_WRITE = IB_UVERBS_DEVICE_ATOMIC_WRITE,
};

/* Capability flags defined here as plain bit positions (no uverbs mirror). */
enum ib_kernel_cap_flags {
	/*
	 * This device supports a per-device lkey or stag that can be
	 * used without performing a memory registration for the local
	 * memory.  Note that ULPs should never check this flag, but
	 * instead use the local_dma_lkey flag in the ib_pd structure,
	 * which will always contain a usable lkey.
	 */
	IBK_LOCAL_DMA_LKEY = 1 << 0,
	/* IB_QP_CREATE_INTEGRITY_EN is supported to implement T10-PI */
	IBK_INTEGRITY_HANDOVER = 1 << 1,
	/* IB_ACCESS_ON_DEMAND is supported during reg_user_mr() */
	IBK_ON_DEMAND_PAGING = 1 << 2,
	/* IB_MR_TYPE_SG_GAPS is supported */
	IBK_SG_GAPS_REG = 1 << 3,
	/* Driver supports RDMA_NLDEV_CMD_DELLINK */
	IBK_ALLOW_USER_UNREG = 1 << 4,

	/* ipoib will use IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK */
	IBK_BLOCK_MULTICAST_LOOPBACK = 1 << 5,
	/* ipoib will use IB_QP_CREATE_IPOIB_UD_LSO for its QPs */
	IBK_UD_TSO = 1 << 6,
	/* ipoib will use the device ops:
	 *   get_vf_config
	 *   get_vf_guid
	 *   get_vf_stats
	 *   set_vf_guid
	 *   set_vf_link_state
	 */
	IBK_VIRTUAL_FUNCTION = 1 << 7,
	/* ipoib will use IB_QP_CREATE_NETDEV_USE for its QPs */
	IBK_RDMA_NETDEV_OPA = 1 << 8,
};

enum ib_atomic_cap {
	IB_ATOMIC_NONE,
	IB_ATOMIC_HCA,
	IB_ATOMIC_GLOB
};

enum ib_odp_general_cap_bits {
	IB_ODP_SUPPORT = 1 << 0,
	IB_ODP_SUPPORT_IMPLICIT = 1 << 1,
};

enum ib_odp_transport_cap_bits {
	IB_ODP_SUPPORT_SEND = 1 << 0,
	IB_ODP_SUPPORT_RECV = 1 << 1,
	IB_ODP_SUPPORT_WRITE = 1 << 2,
	IB_ODP_SUPPORT_READ = 1 << 3,
	IB_ODP_SUPPORT_ATOMIC = 1 << 4,
	IB_ODP_SUPPORT_SRQ_RECV = 1 << 5,
};

struct ib_odp_caps {
	uint64_t general_caps;
	struct {
		uint32_t rc_odp_caps;
		uint32_t uc_odp_caps;
		uint32_t ud_odp_caps;
		uint32_t xrc_odp_caps;
	} per_transport_caps;
};

struct ib_rss_caps {
	/* Corresponding bit will be set if qp type from
	 * 'enum ib_qp_type' is supported, e.g.
	 * supported_qpts |= 1 << IB_QPT_UD
	 */
	u32 supported_qpts;
	u32 max_rwq_indirection_tables;
	u32 max_rwq_indirection_table_size;
};

enum ib_tm_cap_flags {
	/* Support tag matching with rendezvous offload for RC transport */
	IB_TM_CAP_RNDV_RC = 1 << 0,
};

struct ib_tm_caps {
	/* Max size of RNDV header */
	u32 max_rndv_hdr_size;
	/* Max number of entries in tag matching list */
	u32 max_num_tags;
	/* From enum ib_tm_cap_flags */
	u32 flags;
	/* Max number of outstanding list operations */
	u32 max_ops;
	/* Max number of SGE in tag matching entry */
	u32 max_sge;
};

struct ib_cq_init_attr {
	unsigned int cqe;
	u32 comp_vector;
	u32 flags;
};

enum ib_cq_attr_mask {
	IB_CQ_MODERATE = 1 << 0,
};

struct ib_cq_caps {
	u16 max_cq_moderation_count;
	u16 max_cq_moderation_period;
};

struct ib_dm_mr_attr {
	u64 length;
	u64 offset;
	u32 access_flags;
};

struct ib_dm_alloc_attr {
	u64 length;
	u32 alignment;
	u32 flags;
};

/* Device-wide attributes and limits. */
struct ib_device_attr {
	u64 fw_ver;
	__be64 sys_image_guid;
	u64 max_mr_size;
	u64 page_size_cap;
	u32 vendor_id;
	u32 vendor_part_id;
	u32 hw_ver;
	int max_qp;
	int max_qp_wr;
	u64 device_cap_flags;
	u64 kernel_cap_flags;
	int max_send_sge;
	int max_recv_sge;
	int max_sge_rd;
	int max_cq;
	int max_cqe;
	int max_mr;
	int max_pd;
	int max_qp_rd_atom;
	int max_ee_rd_atom;
	int max_res_rd_atom;
	int max_qp_init_rd_atom;
	int max_ee_init_rd_atom;
	enum ib_atomic_cap atomic_cap;
	enum ib_atomic_cap masked_atomic_cap;
	int max_ee;
	int max_rdd;
	int max_mw;
	int max_raw_ipv6_qp;
	int max_raw_ethy_qp;
	int max_mcast_grp;
	int max_mcast_qp_attach;
	int max_total_mcast_qp_attach;
	int max_ah;
	int max_srq;
	int max_srq_wr;
	int max_srq_sge;
	unsigned int max_fast_reg_page_list_len;
	unsigned int max_pi_fast_reg_page_list_len;
	u16 max_pkeys;
	u8 local_ca_ack_delay;
	int sig_prot_cap;
	int sig_guard_cap;
	struct ib_odp_caps odp_caps;
	uint64_t timestamp_mask;
	uint64_t hca_core_clock; /* in KHZ */
	struct ib_rss_caps rss_caps;
	u32 max_wq_type_rq;
	u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */
	struct ib_tm_caps tm_caps;
	struct ib_cq_caps cq_caps;
	u64 max_dm_size;
	/* Max entries for sgl for optimized performance per READ */
	u32 max_sgl_rd;
};

/* Encoded MTU values; see ib_mtu_enum_to_int() for the numeric sizes. */
enum ib_mtu {
	IB_MTU_256 = 1,
	IB_MTU_512 = 2,
	IB_MTU_1024 = 3,
	IB_MTU_2048 = 4,
	IB_MTU_4096 = 5
};

/* OPA MTU encodings; numbering continues after the last enum ib_mtu value. */
enum opa_mtu {
	OPA_MTU_8192 = 6,
	OPA_MTU_10240 = 7
};

/* Convert an enum ib_mtu to its numeric MTU; returns -1 for unknown values. */
static inline int ib_mtu_enum_to_int(enum ib_mtu mtu)
{
	switch (mtu) {
	case IB_MTU_256:
		return 256;
	case IB_MTU_512:
		return 512;
	case IB_MTU_1024:
		return 1024;
	case IB_MTU_2048:
		return 2048;
	case IB_MTU_4096:
		return 4096;
	default:
		return -1;
	}
}

/*
 * Convert a numeric MTU to the largest enum ib_mtu that does not exceed it.
 * Anything below 512 (including zero and negative values) maps to
 * IB_MTU_256; anything at or above 4096 maps to IB_MTU_4096.
 */
static inline enum ib_mtu ib_mtu_int_to_enum(int mtu)
{
	if (mtu >= 4096)
		return IB_MTU_4096;
	else if (mtu >= 2048)
		return IB_MTU_2048;
	else if (mtu >= 1024)
		return IB_MTU_1024;
	else if (mtu >= 512)
		return IB_MTU_512;
	else
		return IB_MTU_256;
}

/*
 * Convert an enum opa_mtu to its numeric MTU.  Values other than the two
 * OPA encodings are handed to ib_mtu_enum_to_int().
 */
static inline int opa_mtu_enum_to_int(enum opa_mtu mtu)
{
	switch (mtu) {
	case OPA_MTU_8192:
		return 8192;
	case OPA_MTU_10240:
		return 10240;
	default:
		return(ib_mtu_enum_to_int((enum ib_mtu)mtu));
	}
}

/*
 * Convert a numeric MTU to an enum opa_mtu.  Values below 8192 are handed
 * to ib_mtu_int_to_enum() and the result is cast to enum opa_mtu.
 */
static inline enum opa_mtu opa_mtu_int_to_enum(int mtu)
{
	if (mtu >= 10240)
		return OPA_MTU_10240;
	else if (mtu >= 8192)
		return OPA_MTU_8192;
	else
		return ((enum opa_mtu)ib_mtu_int_to_enum(mtu));
}

enum ib_port_state {
	IB_PORT_NOP = 0,
	IB_PORT_DOWN = 1,
	IB_PORT_INIT = 2,
	IB_PORT_ARMED = 3,
	IB_PORT_ACTIVE = 4,
	IB_PORT_ACTIVE_DEFER = 5
};

enum ib_port_phys_state {
	IB_PORT_PHYS_STATE_SLEEP = 1,
	IB_PORT_PHYS_STATE_POLLING = 2,
	IB_PORT_PHYS_STATE_DISABLED = 3,
	IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING = 4,
	IB_PORT_PHYS_STATE_LINK_UP = 5,
	IB_PORT_PHYS_STATE_LINK_ERROR_RECOVERY = 6,
	IB_PORT_PHYS_STATE_PHY_TEST = 7,
};

/*
 * Port width encodings.  The enumerator values are not ordered by width
 * (IB_WIDTH_2X is 16); use ib_width_enum_to_int() to obtain the number.
 */
enum ib_port_width {
	IB_WIDTH_1X = 1,
	IB_WIDTH_2X = 16,
	IB_WIDTH_4X = 2,
	IB_WIDTH_8X = 4,
	IB_WIDTH_12X = 8
};

/* Convert an enum ib_port_width to 1, 2, 4, 8 or 12; -1 for unknown values. */
static inline int ib_width_enum_to_int(enum ib_port_width width)
{
	switch (width) {
	case IB_WIDTH_1X:
		return 1;
	case IB_WIDTH_2X:
		return 2;
	case IB_WIDTH_4X:
		return 4;
	case IB_WIDTH_8X:
		return 8;
	case IB_WIDTH_12X:
		return 12;
	default:
		return -1;
	}
}

/* Port speed encodings; each enumerator is a distinct single bit. */
enum ib_port_speed {
	IB_SPEED_SDR = 1,
	IB_SPEED_DDR = 2,
	IB_SPEED_QDR = 4,
	IB_SPEED_FDR10 = 8,
	IB_SPEED_FDR = 16,
	IB_SPEED_EDR = 32,
	IB_SPEED_HDR = 64,
	IB_SPEED_NDR = 128,
	IB_SPEED_XDR = 256,
};

enum ib_stat_flag {
	IB_STAT_FLAG_OPTIONAL = 1 << 0,
};

/**
 * struct rdma_stat_desc - Descriptor of a single statistics counter
 * @name: The name of the counter
 * @flags: Flags of the counter; for example, IB_STAT_FLAG_OPTIONAL
 * @priv: Driver private information; core code should not use
 */
struct rdma_stat_desc {
	const char *name;
	unsigned int flags;
	const void *priv;
};

/**
 * struct rdma_hw_stats - Hardware counter values and their bookkeeping
 * @lock: Mutex to protect parallel write access to lifespan and values
 *    of counters, which are 64 bits wide and not guaranteed to be written
 *    atomically on 32-bit systems.
 * @timestamp: Used by the core code to track when the last update was
 * @lifespan: Used by the core code to determine how old the counters
 *    should be before being updated again.  Stored in jiffies, defaults
 *    to 10 milliseconds, drivers can override the default by specifying
 *    their own value during their allocation routine.
 * @descs: Array of pointers to static descriptors used for the counters
 *    in directory.
 * @is_disabled: A bitmap to indicate whether each counter is currently
 *    disabled or not.
 * @num_counters: How many hardware counters there are.  If the @descs array
 *    is shorter than this number, a kernel oops will result.  Driver authors
 *    are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@descs) < num_counters)
 *    in their code to prevent this.
 * @value: Array of u64 counters that are accessed by the sysfs code and
 *    filled in by the driver's get_stats routine
 */
struct rdma_hw_stats {
	struct mutex lock; /* Protect lifespan and value[] */
	unsigned long timestamp;
	unsigned long lifespan;
	const struct rdma_stat_desc *descs;
	unsigned long *is_disabled;
	int num_counters;
	u64 value[] __counted_by(num_counters);
};

#define RDMA_HW_STATS_DEFAULT_LIFESPAN 10

struct rdma_hw_stats *rdma_alloc_hw_stats_struct(
	const struct rdma_stat_desc *descs, int num_counters,
	unsigned long lifespan);

void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats);

/* Define bits for the various functionality this port needs to be supported by
 * the core.
 */
/* Management                           0x00000FFF */
#define RDMA_CORE_CAP_IB_MAD 0x00000001
#define RDMA_CORE_CAP_IB_SMI 0x00000002
#define RDMA_CORE_CAP_IB_CM 0x00000004
#define RDMA_CORE_CAP_IW_CM 0x00000008
#define RDMA_CORE_CAP_IB_SA 0x00000010
#define RDMA_CORE_CAP_OPA_MAD 0x00000020

/* Address format                       0x000FF000 */
#define RDMA_CORE_CAP_AF_IB 0x00001000
#define RDMA_CORE_CAP_ETH_AH 0x00002000
#define RDMA_CORE_CAP_OPA_AH 0x00004000
#define RDMA_CORE_CAP_IB_GRH_REQUIRED 0x00008000

/* Protocol                             0xFFF00000 */
#define RDMA_CORE_CAP_PROT_IB 0x00100000
#define RDMA_CORE_CAP_PROT_ROCE 0x00200000
#define RDMA_CORE_CAP_PROT_IWARP 0x00400000
#define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000
#define RDMA_CORE_CAP_PROT_RAW_PACKET 0x01000000
#define RDMA_CORE_CAP_PROT_USNIC 0x02000000

/* Composite capability sets built from the RDMA_CORE_CAP_* bits above. */
#define RDMA_CORE_PORT_IB_GRH_REQUIRED (RDMA_CORE_CAP_IB_GRH_REQUIRED \
					| RDMA_CORE_CAP_PROT_ROCE \
					| RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP)

#define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \
				| RDMA_CORE_CAP_IB_MAD \
				| RDMA_CORE_CAP_IB_SMI \
				| RDMA_CORE_CAP_IB_CM \
				| RDMA_CORE_CAP_IB_SA \
				| RDMA_CORE_CAP_AF_IB)
#define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \
				| RDMA_CORE_CAP_IB_MAD \
				| RDMA_CORE_CAP_IB_CM \
				| RDMA_CORE_CAP_AF_IB \
				| RDMA_CORE_CAP_ETH_AH)
#define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \
				(RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \
				| RDMA_CORE_CAP_IB_MAD \
				| RDMA_CORE_CAP_IB_CM \
				| RDMA_CORE_CAP_AF_IB \
				| RDMA_CORE_CAP_ETH_AH)
#define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \
				| RDMA_CORE_CAP_IW_CM)
#define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \
				| RDMA_CORE_CAP_OPA_MAD)

#define RDMA_CORE_PORT_RAW_PACKET (RDMA_CORE_CAP_PROT_RAW_PACKET)

#define RDMA_CORE_PORT_USNIC (RDMA_CORE_CAP_PROT_USNIC)

/* Per-port attributes. */
struct ib_port_attr {
	u64 subnet_prefix;
	enum ib_port_state state;
	enum ib_mtu max_mtu;
	enum ib_mtu active_mtu;
	u32 phys_mtu;
	int gid_tbl_len;
	unsigned int ip_gids:1;
	/* This is the value from PortInfo CapabilityMask, defined by IBA */
	u32 port_cap_flags;
	u32 max_msg_sz;
	u32 bad_pkey_cntr;
	u32 qkey_viol_cntr;
	u16 pkey_tbl_len;
	u32 sm_lid;
	u32 lid;
	u8 lmc;
	u8 max_vl_num;
	u8 sm_sl;
	u8 subnet_timeout;
	u8 init_type_reply;
	u8 active_width;
	u16 active_speed;
	u8 phys_state;
	u16 port_cap_flags2;
};

enum ib_device_modify_flags {
	IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0,
	IB_DEVICE_MODIFY_NODE_DESC = 1 << 1
};

#define IB_DEVICE_NODE_DESC_MAX 64

struct ib_device_modify {
	u64 sys_image_guid;
	char node_desc[IB_DEVICE_NODE_DESC_MAX];
};

/* Port modification flags; note that bit 1 is not assigned here. */
enum ib_port_modify_flags {
	IB_PORT_SHUTDOWN = 1,
	IB_PORT_INIT_TYPE = (1<<2),
	IB_PORT_RESET_QKEY_CNTR = (1<<3),
	IB_PORT_OPA_MASK_CHG = (1<<4)
};

struct ib_port_modify {
	u32 set_port_cap_mask;
	u32 clr_port_cap_mask;
	u8 init_type;
};

enum ib_event_type {
	IB_EVENT_CQ_ERR,
	IB_EVENT_QP_FATAL,
	IB_EVENT_QP_REQ_ERR,
	IB_EVENT_QP_ACCESS_ERR,
	IB_EVENT_COMM_EST,
	IB_EVENT_SQ_DRAINED,
	IB_EVENT_PATH_MIG,
	IB_EVENT_PATH_MIG_ERR,
	IB_EVENT_DEVICE_FATAL,
	IB_EVENT_PORT_ACTIVE,
	IB_EVENT_PORT_ERR,
	IB_EVENT_LID_CHANGE,
	IB_EVENT_PKEY_CHANGE,
	IB_EVENT_SM_CHANGE,
	IB_EVENT_SRQ_ERR,
	IB_EVENT_SRQ_LIMIT_REACHED,
	IB_EVENT_QP_LAST_WQE_REACHED,
	IB_EVENT_CLIENT_REREGISTER,
	IB_EVENT_GID_CHANGE,
	IB_EVENT_WQ_FATAL,
};

const char *__attribute_const__ ib_event_msg(enum ib_event_type event);

struct ib_event {
	struct ib_device *device;
	union {
		struct ib_cq *cq;
		struct ib_qp
*qp; struct ib_srq |